Merge pull request #7219 from neondatabase/rc/2024-03-25

Release 2024-03-25
2026-01-16 09:52:54 +00:00 · 2024-03-25 12:28:09 +00:00
parent 7b860b837c 3220f830b7
commit 0d3e499059
135 changed files with 5689 additions and 2311 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1121,10 +1121,16 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-
-            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+              -f deployPgSniRouter=false \
+              -f deployProxy=false \
+              -f deployStorage=true \
+              -f deployStorageBroker=true \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}} \
+              -f deployPreprodRegion=true
+
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
@@ -1133,6 +1139,15 @@ jobs:
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+              -f deployPgSniRouter=true \
+              -f deployProxy=true \
+              -f deployStorage=false \
+              -f deployStorageBroker=false \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}} \
+              -f deployPreprodRegion=true
+
            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
--- a/3
+++ b/3
@@ -1,12 +1,13 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
+/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -277,6 +277,7 @@ dependencies = [
 "anyhow",
 "aws-config",
 "aws-sdk-secretsmanager",
+ "bytes",
 "camino",
 "clap",
 "control_plane",
@@ -288,6 +289,8 @@ dependencies = [
 "hex",
 "humantime",
 "hyper",
+ "lasso",
+ "measured",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -295,6 +298,7 @@ dependencies = [
 "postgres_connection",
 "r2d2",
 "reqwest",
+ "routerify",
 "serde",
 "serde_json",
 "thiserror",
@@ -343,9 +347,9 @@ dependencies = [

 [[package]]
 name = "aws-credential-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
+checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-runtime-api",
@@ -355,9 +359,9 @@ dependencies = [

 [[package]]
 name = "aws-runtime"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
+checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
 dependencies = [
 "aws-credential-types",
 "aws-sigv4",
@@ -377,6 +381,29 @@ dependencies = [
 "uuid",
 ]

+[[package]]
+name = "aws-sdk-iam"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
+dependencies = [
+ "aws-credential-types",
+ "aws-runtime",
+ "aws-smithy-async",
+ "aws-smithy-http",
+ "aws-smithy-json",
+ "aws-smithy-query",
+ "aws-smithy-runtime",
+ "aws-smithy-runtime-api",
+ "aws-smithy-types",
+ "aws-smithy-xml",
+ "aws-types",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
+ "tracing",
+]
+
 [[package]]
 name = "aws-sdk-s3"
 version = "1.14.0"
@@ -498,9 +525,9 @@ dependencies = [

 [[package]]
 name = "aws-sigv4"
-version = "1.1.4"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
+checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-eventstream",
@@ -513,7 +540,7 @@ dependencies = [
 "hex",
 "hmac",
 "http 0.2.9",
- "http 1.0.0",
+ "http 1.1.0",
 "once_cell",
 "p256",
 "percent-encoding",
@@ -527,9 +554,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-async"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
+checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
 dependencies = [
 "futures-util",
 "pin-project-lite",
@@ -570,9 +597,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-http"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
+checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
 dependencies = [
 "aws-smithy-eventstream",
 "aws-smithy-runtime-api",
@@ -591,18 +618,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-json"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
+checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
 dependencies = [
 "aws-smithy-types",
 ]

 [[package]]
 name = "aws-smithy-query"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
+checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
 dependencies = [
 "aws-smithy-types",
 "urlencoding",
@@ -610,9 +637,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
+checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-http",
@@ -635,14 +662,15 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.1.4"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
+checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-types",
 "bytes",
 "http 0.2.9",
+ "http 1.1.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -651,9 +679,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
+checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
 dependencies = [
 "base64-simd",
 "bytes",
@@ -674,18 +702,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
+checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
 dependencies = [
 "xmlparser",
 ]

 [[package]]
 name = "aws-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
+checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-async",
@@ -2392,9 +2420,9 @@ dependencies = [

 [[package]]
 name = "http"
-version = "1.0.0"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
+checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
 dependencies = [
 "bytes",
 "fnv",
@@ -2494,7 +2522,7 @@ dependencies = [
 "hyper",
 "log",
 "rustls 0.21.9",
- "rustls-native-certs",
+ "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
 ]
@@ -2880,6 +2908,35 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

+[[package]]
+name = "measured"
+version = "0.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
+dependencies = [
+ "bytes",
+ "hashbrown 0.14.0",
+ "itoa",
+ "lasso",
+ "measured-derive",
+ "memchr",
+ "parking_lot 0.12.1",
+ "rustc-hash",
+ "ryu",
+]
+
+[[package]]
+name = "measured-derive"
+version = "0.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.52",
+]
+
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -3901,7 +3958,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3914,7 +3971,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3925,7 +3982,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3938,12 +3995,13 @@ dependencies = [
 "rand 0.8.5",
 "sha2",
 "stringprep",
+ "tokio",
 ]

 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4165,6 +4223,10 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "aws-config",
+ "aws-sdk-iam",
+ "aws-sigv4",
+ "aws-types",
 "base64 0.13.1",
 "bstr",
 "bytes",
@@ -4175,6 +4237,7 @@ dependencies = [
 "consumption_metrics",
 "dashmap",
 "env_logger",
+ "fallible-iterator",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4182,6 +4245,7 @@ dependencies = [
 "hex",
 "hmac",
 "hostname",
+ "http 1.1.0",
 "humantime",
 "hyper",
 "hyper-tungstenite",
@@ -4225,6 +4289,7 @@ dependencies = [
 "smallvec",
 "smol_str",
 "socket2 0.5.5",
+ "subtle",
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
@@ -4396,9 +4461,9 @@ dependencies = [

 [[package]]
 name = "redis"
-version = "0.24.0"
+version = "0.25.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
+checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
 dependencies = [
 "async-trait",
 "bytes",
@@ -4407,15 +4472,15 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.9",
- "rustls-native-certs",
- "rustls-pemfile 1.0.2",
- "rustls-webpki 0.101.7",
+ "rustls 0.22.2",
+ "rustls-native-certs 0.7.0",
+ "rustls-pemfile 2.1.1",
+ "rustls-pki-types",
 "ryu",
 "sha1_smol",
- "socket2 0.4.9",
+ "socket2 0.5.5",
 "tokio",
- "tokio-rustls 0.24.0",
+ "tokio-rustls 0.25.0",
 "tokio-util",
 "url",
 ]
@@ -4844,6 +4909,19 @@ dependencies = [
 "security-framework",
 ]

+[[package]]
+name = "rustls-native-certs"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
+dependencies = [
+ "openssl-probe",
+ "rustls-pemfile 2.1.1",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework",
+]
+
 [[package]]
 name = "rustls-pemfile"
 version = "1.0.2"
@@ -5346,13 +5424,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"

 [[package]]
 name = "sha2"
-version = "0.10.6"
+version = "0.10.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
+checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
 dependencies = [
 "cfg-if",
 "cpufeatures",
 "digest",
+ "sha2-asm",
+]
+
+[[package]]
+name = "sha2-asm"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
+dependencies = [
+ "cc",
 ]

 [[package]]
@@ -5935,7 +6023,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6101,7 +6189,7 @@ dependencies = [
 "percent-encoding",
 "pin-project",
 "prost",
- "rustls-native-certs",
+ "rustls-native-certs 0.6.2",
 "rustls-pemfile 1.0.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -6468,6 +6556,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "arc-swap",
+ "async-compression",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6506,12 +6595,14 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
+ "tokio-tar",
 "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
 "url",
 "uuid",
+ "walkdir",
 "workspace_hack",
 ]

@@ -6983,7 +7074,6 @@ dependencies = [
 "aws-sigv4",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-runtime-api",
 "aws-smithy-types",
 "axum",
 "base64 0.21.1",
@@ -7029,6 +7119,7 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "sha2",
 "smallvec",
 "subtle",
 "syn 1.0.109",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,9 +53,12 @@ async-trait = "0.1"
 aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "1.14"
 aws-sdk-secretsmanager = { version = "1.14.0" }
+aws-sdk-iam = "1.15.0"
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
 aws-credential-types = "1.1.4"
+aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
+aws-types = "1.1.7"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -76,6 +79,7 @@ either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
+fallible-iterator = "0.2"
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
@@ -88,6 +92,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
+http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
@@ -101,6 +106,7 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
+measured = { version = "0.0.13", features=["default", "lasso"] }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -120,7 +126,7 @@ procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -148,6 +154,7 @@ smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
+"subtle"  = "2.5.0"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.76.0
+ENV RUSTC_VERSION=1.77.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
-    cargo install cargo-deny && \
+    cargo install cargo-deny --locked && \
    cargo install cargo-hack && \
    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \
            -b /usr/local/bin/postgres
 ```

+## State Diagram
+
+Computes can be in various states. Below is a diagram that details how a
+compute moves between states.
+
+```mermaid
+%% https://mermaid.js.org/syntax/stateDiagram.html
+stateDiagram-v2
+  [*] --> Empty : Compute spawned
+  Empty --> ConfigurationPending : Waiting for compute spec
+  ConfigurationPending --> Configuration : Received compute spec
+  Configuration --> Failed : Failed to configure the compute
+  Configuration --> Running : Compute has been configured
+  Empty --> Init : Compute spec is immediately available
+  Empty --> TerminationPending : Requested termination
+  Init --> Failed : Failed to start Postgres
+  Init --> Running : Started Postgres
+  Running --> TerminationPending : Requested termination
+  TerminationPending --> Terminated : Terminated compute
+  Failed --> [*] : Compute exited
+  Terminated --> [*] : Compute exited
+```
+
 ## Tests

 Cargo formatter:
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
        .write(true)
        .create(true)
        .append(false)
+        .truncate(false)
        .open(path)?;
    let buf = io::BufReader::new(&file);
    let mut count: usize = 0;
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -17,6 +17,7 @@ testing = []
 anyhow.workspace = true
 aws-config.workspace = true
 aws-sdk-secretsmanager.workspace = true
+bytes.workspace = true
 camino.workspace = true
 clap.workspace = true
 fail.workspace = true
@@ -25,17 +26,20 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
+lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
 reqwest.workspace = true
+routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
+measured.workspace = true

 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
@@ -0,0 +1,3 @@
+
+UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
+UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
@@ -0,0 +1,3 @@
+
+UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
+UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,11 @@
+use crate::metrics::{
+    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
+    METRICS_REGISTRY,
+};
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
+use futures::Future;
+use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
@@ -34,6 +40,8 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

 use control_plane::storage_controller::{AttachHookRequest, InspectRequest};

+use routerify::Middleware;
+
 /// State available to HTTP request handlers
 #[derive(Clone)]
 pub struct HttpState {
@@ -313,7 +321,7 @@ async fn handle_tenant_timeline_passthrough(
    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);

    // Find the node that holds shard zero
-    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
+    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;

    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
    // rewrite this to a shard-aware shard zero ID.
@@ -322,12 +330,39 @@ async fn handle_tenant_timeline_passthrough(
    let tenant_shard_str = format!("{}", tenant_shard_id);
    let path = path.replace(&tenant_str, &tenant_shard_str);

-    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
+    let latency = &METRICS_REGISTRY
+        .metrics_group
+        .storage_controller_passthrough_request_latency;
+
+    // This is a bit awkward. We remove the param from the request
+    // and join the words by '_' to get a label for the request.
+    let just_path = path.replace(&tenant_shard_str, "");
+    let path_label = just_path
+        .split('/')
+        .filter(|token| !token.is_empty())
+        .collect::<Vec<_>>()
+        .join("_");
+    let labels = PageserverRequestLabelGroup {
+        pageserver_id: &node.get_id().to_string(),
+        path: &path_label,
+        method: crate::metrics::Method::Get,
+    };
+
+    let _timer = latency.start_timer(labels.clone());
+
+    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
    let resp = client.get_raw(path).await.map_err(|_e|
        // FIXME: give APiError a proper Unavailable variant.  We return 503 here because
        // if we can't successfully send a request to the pageserver, we aren't available.
        ApiError::ShuttingDown)?;

+    if !resp.status().is_success() {
+        let error_counter = &METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_passthrough_request_error;
+        error_counter.inc(labels);
+    }
+
    // We have a reqest::Response, would like a http::Response
    let mut builder = hyper::Response::builder()
        .status(resp.status())
@@ -353,6 +388,16 @@ async fn handle_tenant_locate(
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

+async fn handle_tenant_describe(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
+}
+
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -488,7 +533,11 @@ impl From<ReconcileError> for ApiError {

 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+async fn tenant_service_handler<R, H>(
+    request: Request<Body>,
+    handler: H,
+    request_name: RequestName,
+) -> R::Output
 where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -508,9 +557,10 @@ where
        ));
    }

-    request_span(
+    named_request_span(
        request,
        |request| async move { handler(service, request).await },
+        request_name,
    )
    .await
 }
@@ -521,11 +571,98 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
    })
 }

+#[derive(Clone, Debug)]
+struct RequestMeta {
+    method: hyper::http::Method,
+    at: Instant,
+}
+
+fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
+) -> Middleware<B, ApiError> {
+    Middleware::pre(move |req| async move {
+        let meta = RequestMeta {
+            method: req.method().clone(),
+            at: Instant::now(),
+        };
+
+        req.set_context(meta);
+
+        Ok(req)
+    })
+}
+
+fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
+) -> Middleware<B, ApiError> {
+    Middleware::post_with_info(move |resp, req_info| async move {
+        let request_name = match req_info.context::<RequestName>() {
+            Some(name) => name,
+            None => {
+                return Ok(resp);
+            }
+        };
+
+        if let Some(meta) = req_info.context::<RequestMeta>() {
+            let status = &crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_http_request_status;
+            let latency = &crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_http_request_latency;
+
+            status.inc(HttpRequestStatusLabelGroup {
+                path: request_name.0,
+                method: meta.method.clone().into(),
+                status: crate::metrics::StatusCode(resp.status()),
+            });
+
+            latency.observe(
+                HttpRequestLatencyLabelGroup {
+                    path: request_name.0,
+                    method: meta.method.into(),
+                },
+                meta.at.elapsed().as_secs_f64(),
+            );
+        }
+        Ok(resp)
+    })
+}
+
+pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
+
+    let payload = crate::metrics::METRICS_REGISTRY.encode();
+    let response = Response::builder()
+        .status(200)
+        .header(CONTENT_TYPE, TEXT_FORMAT)
+        .body(payload.into())
+        .unwrap();
+
+    Ok(response)
+}
+
+#[derive(Clone)]
+struct RequestName(&'static str);
+
+async fn named_request_span<R, H>(
+    request: Request<Body>,
+    handler: H,
+    name: RequestName,
+) -> R::Output
+where
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
+{
+    request.set_context(name);
+    request_span(request, handler).await
+}
+
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router();
+    let mut router = endpoint::make_router()
+        .middleware(prologue_metrics_middleware())
+        .middleware(epilogue_metrics_middleware());
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
@@ -534,96 +671,166 @@ pub fn make_router(
            } else {
                state.auth.as_deref()
            }
-        }))
+        }));
    }

    router
        .data(Arc::new(HttpState::new(service, auth)))
+        .get("/metrics", |r| {
+            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
+        })
        // Non-prefixed generic endpoints (status, metrics)
-        .get("/status", |r| request_span(r, handle_status))
-        .get("/ready", |r| request_span(r, handle_ready))
+        .get("/status", |r| {
+            named_request_span(r, handle_status, RequestName("status"))
+        })
+        .get("/ready", |r| {
+            named_request_span(r, handle_ready, RequestName("ready"))
+        })
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
-            request_span(r, handle_re_attach)
+            named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
+        })
+        .post("/upcall/v1/validate", |r| {
+            named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
        })
-        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
        // Test/dev/debug endpoints
        .post("/debug/v1/attach-hook", |r| {
-            request_span(r, handle_attach_hook)
+            named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
+        })
+        .post("/debug/v1/inspect", |r| {
+            named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
        })
-        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            request_span(r, handle_tenant_drop)
+            named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
        })
        .post("/debug/v1/node/:node_id/drop", |r| {
-            request_span(r, handle_node_drop)
+            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
+        })
+        .get("/debug/v1/tenant", |r| {
+            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
+        })
+        .get("/debug/v1/tenant/:tenant_id/locate", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_locate,
+                RequestName("debug_v1_tenant_locate"),
+            )
        })
-        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
        .get("/debug/v1/scheduler", |r| {
-            request_span(r, handle_scheduler_dump)
+            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
        })
        .post("/debug/v1/consistency_check", |r| {
-            request_span(r, handle_consistency_check)
+            named_request_span(
+                r,
+                handle_consistency_check,
+                RequestName("debug_v1_consistency_check"),
+            )
        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
-        .get("/control/v1/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(r, handle_tenant_locate)
-        })
        // Node operations
        .post("/control/v1/node", |r| {
-            request_span(r, handle_node_register)
+            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
+        })
+        .get("/control/v1/node", |r| {
+            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
        })
-        .get("/control/v1/node", |r| request_span(r, handle_node_list))
        .put("/control/v1/node/:node_id/config", |r| {
-            request_span(r, handle_node_configure)
+            named_request_span(
+                r,
+                handle_node_configure,
+                RequestName("control_v1_node_config"),
+            )
        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
+            tenant_service_handler(
+                r,
+                handle_tenant_shard_migrate,
+                RequestName("control_v1_tenant_migrate"),
+            )
        })
        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(r, handle_tenant_shard_split)
+            tenant_service_handler(
+                r,
+                handle_tenant_shard_split,
+                RequestName("control_v1_tenant_shard_split"),
+            )
+        })
+        .get("/control/v1/tenant/:tenant_id", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_describe,
+                RequestName("control_v1_tenant_describe"),
+            )
        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
        .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create)
+            tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
        })
        .delete("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_delete)
+            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
        })
        .put("/v1/tenant/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_set)
+            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_get)
+            tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
-            tenant_service_handler(r, handle_tenant_location_config)
+            tenant_service_handler(
+                r,
+                handle_tenant_location_config,
+                RequestName("v1_tenant_location_config"),
+            )
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
-            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
+            tenant_service_handler(
+                r,
+                handle_tenant_time_travel_remote_storage,
+                RequestName("v1_tenant_time_travel_remote_storage"),
+            )
        })
        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
-            tenant_service_handler(r, handle_tenant_secondary_download)
+            tenant_service_handler(
+                r,
+                handle_tenant_secondary_download,
+                RequestName("v1_tenant_secondary_download"),
+            )
        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_delete)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_delete,
+                RequestName("v1_tenant_timeline"),
+            )
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_create)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_create,
+                RequestName("v1_tenant_timeline"),
+            )
        })
        // Tenant detail GET passthrough to shard zero
        .get("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_passthrough)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_passthrough,
+                RequestName("v1_tenant_passthrough"),
+            )
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
        // timeline GET APIs will be implicitly included.
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_passthrough)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_passthrough,
+                RequestName("v1_tenant_timeline_passthrough"),
+            )
        })
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -8,6 +8,7 @@ pub mod http;
 mod id_lock_map;
 pub mod metrics;
 mod node;
+mod pageserver_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -1,32 +1,284 @@
-use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+//!
+//! This module provides metric definitions for the storage controller.
+//!
+//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
+//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
+//! constant.
+//!
+//! The rest of the code defines label group types and deals with converting outer types to labels.
+//!
+use bytes::Bytes;
+use measured::{
+    label::{LabelValue, StaticLabelSet},
+    FixedCardinalityLabel, MetricGroup,
+};
 use once_cell::sync::Lazy;
+use std::sync::Mutex;

-pub(crate) struct ReconcilerMetrics {
-    pub(crate) spawned: IntCounter,
-    pub(crate) complete: IntCounterVec,
-}
+use crate::persistence::{DatabaseError, DatabaseOperation};

-impl ReconcilerMetrics {
-    // Labels used on [`Self::complete`]
-    pub(crate) const SUCCESS: &'static str = "ok";
-    pub(crate) const ERROR: &'static str = "success";
-    pub(crate) const CANCEL: &'static str = "cancel";
-}
-
-pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
-    spawned: register_int_counter!(
-        "storage_controller_reconcile_spawn",
-        "Count of how many times we spawn a reconcile task",
-    )
-    .expect("failed to define a metric"),
-    complete: register_int_counter_vec!(
-        "storage_controller_reconcile_complete",
-        "Reconciler tasks completed, broken down by success/failure/cancelled",
-        &["status"],
-    )
-    .expect("failed to define a metric"),
-});
+pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
+    Lazy::new(StorageControllerMetrics::default);

 pub fn preinitialize_metrics() {
-    Lazy::force(&RECONCILER);
+    Lazy::force(&METRICS_REGISTRY);
+}
+
+pub(crate) struct StorageControllerMetrics {
+    pub(crate) metrics_group: StorageControllerMetricGroup,
+    encoder: Mutex<measured::text::TextEncoder>,
+}
+
+#[derive(measured::MetricGroup)]
+pub(crate) struct StorageControllerMetricGroup {
+    /// Count of how many times we spawn a reconcile task
+    pub(crate) storage_controller_reconcile_spawn: measured::Counter,
+    /// Reconciler tasks completed, broken down by success/failure/cancelled
+    pub(crate) storage_controller_reconcile_complete:
+        measured::CounterVec<ReconcileCompleteLabelGroupSet>,
+
+    /// HTTP request status counters for handled requests
+    pub(crate) storage_controller_http_request_status:
+        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
+    /// HTTP request handler latency across all status codes
+    pub(crate) storage_controller_http_request_latency:
+        measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
+
+    /// Count of HTTP requests to the pageserver that resulted in an error,
+    /// broken down by the pageserver node id, request name and method
+    pub(crate) storage_controller_pageserver_request_error:
+        measured::CounterVec<PageserverRequestLabelGroupSet>,
+
+    /// Latency of HTTP requests to the pageserver, broken down by pageserver
+    /// node id, request name and method. This include both successful and unsuccessful
+    /// requests.
+    pub(crate) storage_controller_pageserver_request_latency:
+        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
+
+    /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
+    /// broken down by the pageserver node id, request name and method
+    pub(crate) storage_controller_passthrough_request_error:
+        measured::CounterVec<PageserverRequestLabelGroupSet>,
+
+    /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
+    /// node id, request name and method. This include both successful and unsuccessful
+    /// requests.
+    pub(crate) storage_controller_passthrough_request_latency:
+        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
+
+    /// Count of errors in database queries, broken down by error type and operation.
+    pub(crate) storage_controller_database_query_error:
+        measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
+
+    /// Latency of database queries, broken down by operation.
+    pub(crate) storage_controller_database_query_latency:
+        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
+}
+
+impl StorageControllerMetrics {
+    pub(crate) fn encode(&self) -> Bytes {
+        let mut encoder = self.encoder.lock().unwrap();
+        self.metrics_group.collect_into(&mut *encoder);
+        encoder.finish()
+    }
+}
+
+impl Default for StorageControllerMetrics {
+    fn default() -> Self {
+        Self {
+            metrics_group: StorageControllerMetricGroup::new(),
+            encoder: Mutex::new(measured::text::TextEncoder::new()),
+        }
+    }
+}
+
+impl StorageControllerMetricGroup {
+    pub(crate) fn new() -> Self {
+        Self {
+            storage_controller_reconcile_spawn: measured::Counter::new(),
+            storage_controller_reconcile_complete: measured::CounterVec::new(
+                ReconcileCompleteLabelGroupSet {
+                    status: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_http_request_status: measured::CounterVec::new(
+                HttpRequestStatusLabelGroupSet {
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                    status: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_http_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_pageserver_request_error: measured::CounterVec::new(
+                PageserverRequestLabelGroupSet {
+                    pageserver_id: lasso::ThreadedRodeo::new(),
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_pageserver_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_passthrough_request_error: measured::CounterVec::new(
+                PageserverRequestLabelGroupSet {
+                    pageserver_id: lasso::ThreadedRodeo::new(),
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_passthrough_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_database_query_error: measured::CounterVec::new(
+                DatabaseQueryErrorLabelGroupSet {
+                    operation: StaticLabelSet::new(),
+                    error_type: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_database_query_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+        }
+    }
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = ReconcileCompleteLabelGroupSet)]
+pub(crate) struct ReconcileCompleteLabelGroup {
+    pub(crate) status: ReconcileOutcome,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = HttpRequestStatusLabelGroupSet)]
+pub(crate) struct HttpRequestStatusLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+    pub(crate) status: StatusCode,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = HttpRequestLatencyLabelGroupSet)]
+pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+}
+
+impl Default for HttpRequestLatencyLabelGroupSet {
+    fn default() -> Self {
+        Self {
+            path: lasso::ThreadedRodeo::new(),
+            method: StaticLabelSet::new(),
+        }
+    }
+}
+
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = PageserverRequestLabelGroupSet)]
+pub(crate) struct PageserverRequestLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) pageserver_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+}
+
+impl Default for PageserverRequestLabelGroupSet {
+    fn default() -> Self {
+        Self {
+            pageserver_id: lasso::ThreadedRodeo::new(),
+            path: lasso::ThreadedRodeo::new(),
+            method: StaticLabelSet::new(),
+        }
+    }
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = DatabaseQueryErrorLabelGroupSet)]
+pub(crate) struct DatabaseQueryErrorLabelGroup {
+    pub(crate) error_type: DatabaseErrorLabel,
+    pub(crate) operation: DatabaseOperation,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = DatabaseQueryLatencyLabelGroupSet)]
+pub(crate) struct DatabaseQueryLatencyLabelGroup {
+    pub(crate) operation: DatabaseOperation,
+}
+
+#[derive(FixedCardinalityLabel)]
+pub(crate) enum ReconcileOutcome {
+    #[label(rename = "ok")]
+    Success,
+    Error,
+    Cancel,
+}
+
+#[derive(FixedCardinalityLabel, Clone)]
+pub(crate) enum Method {
+    Get,
+    Put,
+    Post,
+    Delete,
+    Other,
+}
+
+impl From<hyper::Method> for Method {
+    fn from(value: hyper::Method) -> Self {
+        if value == hyper::Method::GET {
+            Method::Get
+        } else if value == hyper::Method::PUT {
+            Method::Put
+        } else if value == hyper::Method::POST {
+            Method::Post
+        } else if value == hyper::Method::DELETE {
+            Method::Delete
+        } else {
+            Method::Other
+        }
+    }
+}
+
+pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
+
+impl LabelValue for StatusCode {
+    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
+        v.write_int(self.0.as_u16() as u64)
+    }
+}
+
+impl FixedCardinalityLabel for StatusCode {
+    fn cardinality() -> usize {
+        (100..1000).len()
+    }
+
+    fn encode(&self) -> usize {
+        self.0.as_u16() as usize
+    }
+
+    fn decode(value: usize) -> Self {
+        Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
+    }
+}
+
+#[derive(FixedCardinalityLabel)]
+pub(crate) enum DatabaseErrorLabel {
+    Query,
+    Connection,
+    ConnectionPool,
+    Logical,
+}
+
+impl DatabaseError {
+    pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
+        match self {
+            Self::Query(_) => DatabaseErrorLabel::Query,
+            Self::Connection(_) => DatabaseErrorLabel::Connection,
+            Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
+            Self::Logical(_) => DatabaseErrorLabel::Logical,
+        }
+    }
 }
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -12,7 +12,9 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};

-use crate::{persistence::NodePersistence, scheduler::MaySchedule};
+use crate::{
+    pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
+};

 /// Represents the in-memory description of a Node.
 ///
@@ -202,7 +204,7 @@ impl Node {
        cancel: &CancellationToken,
    ) -> Option<mgmt_api::Result<T>>
    where
-        O: FnMut(mgmt_api::Client) -> F,
+        O: FnMut(PageserverClient) -> F,
        F: std::future::Future<Output = mgmt_api::Result<T>>,
    {
        fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -224,8 +226,12 @@ impl Node {
                    .build()
                    .expect("Failed to construct HTTP client");

-                let client =
-                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
+                let client = PageserverClient::from_client(
+                    self.get_id(),
+                    http_client,
+                    self.base_url(),
+                    jwt.as_deref(),
+                );

                let node_cancel_fut = self.cancel.cancelled();

--- a/control_plane/attachment_service/src/pageserver_client.rs
+++ b/control_plane/attachment_service/src/pageserver_client.rs
@@ -0,0 +1,203 @@
+use pageserver_api::{
+    models::{
+        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+    },
+    shard::TenantShardId,
+};
+use pageserver_client::mgmt_api::{Client, Result};
+use reqwest::StatusCode;
+use utils::id::{NodeId, TimelineId};
+
+/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
+/// controller to collect metrics in a non-intrusive manner.
+#[derive(Debug, Clone)]
+pub(crate) struct PageserverClient {
+    inner: Client,
+    node_id_label: String,
+}
+
+macro_rules! measured_request {
+    ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
+        let labels = crate::metrics::PageserverRequestLabelGroup {
+            pageserver_id: $node_id,
+            path: $name,
+            method: $method,
+        };
+
+        let latency = &crate::metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_request_latency;
+        let _timer_guard = latency.start_timer(labels.clone());
+
+        let res = $invoke;
+
+        if res.is_err() {
+            let error_counters = &crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_request_error;
+            error_counters.inc(labels)
+        }
+
+        res
+    }};
+}
+
+impl PageserverClient {
+    pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self {
+            inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
+            node_id_label: node_id.0.to_string(),
+        }
+    }
+
+    pub(crate) fn from_client(
+        node_id: NodeId,
+        raw_client: reqwest::Client,
+        mgmt_api_endpoint: String,
+        jwt: Option<&str>,
+    ) -> Self {
+        Self {
+            inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
+            node_id_label: node_id.0.to_string(),
+        }
+    }
+
+    pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
+        measured_request!(
+            "tenant",
+            crate::metrics::Method::Delete,
+            &self.node_id_label,
+            self.inner.tenant_delete(tenant_shard_id).await
+        )
+    }
+
+    pub(crate) async fn tenant_time_travel_remote_storage(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timestamp: &str,
+        done_if_after: &str,
+    ) -> Result<()> {
+        measured_request!(
+            "tenant_time_travel_remote_storage",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
+                .await
+        )
+    }
+
+    pub(crate) async fn tenant_secondary_download(
+        &self,
+        tenant_id: TenantShardId,
+        wait: Option<std::time::Duration>,
+    ) -> Result<(StatusCode, SecondaryProgress)> {
+        measured_request!(
+            "tenant_secondary_download",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.tenant_secondary_download(tenant_id, wait).await
+        )
+    }
+
+    pub(crate) async fn location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        config: LocationConfig,
+        flush_ms: Option<std::time::Duration>,
+        lazy: bool,
+    ) -> Result<()> {
+        measured_request!(
+            "location_config",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .location_config(tenant_shard_id, config, flush_ms, lazy)
+                .await
+        )
+    }
+
+    pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
+        measured_request!(
+            "location_configs",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.list_location_config().await
+        )
+    }
+
+    pub(crate) async fn get_location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Option<LocationConfig>> {
+        measured_request!(
+            "location_config",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.get_location_config(tenant_shard_id).await
+        )
+    }
+
+    pub(crate) async fn timeline_create(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: &TimelineCreateRequest,
+    ) -> Result<TimelineInfo> {
+        measured_request!(
+            "timeline",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.timeline_create(tenant_shard_id, req).await
+        )
+    }
+
+    pub(crate) async fn timeline_delete(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode> {
+        measured_request!(
+            "timeline",
+            crate::metrics::Method::Delete,
+            &self.node_id_label,
+            self.inner
+                .timeline_delete(tenant_shard_id, timeline_id)
+                .await
+        )
+    }
+
+    pub(crate) async fn tenant_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse> {
+        measured_request!(
+            "tenant_shard_split",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner.tenant_shard_split(tenant_shard_id, req).await
+        )
+    }
+
+    pub(crate) async fn timeline_list(
+        &self,
+        tenant_shard_id: &TenantShardId,
+    ) -> Result<Vec<TimelineInfo>> {
+        measured_request!(
+            "timelines",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.timeline_list(tenant_shard_id).await
+        )
+    }
+
+    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
+        measured_request!(
+            "utilization",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.get_utilization().await
+        )
+    }
+}
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -19,6 +19,9 @@ use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};

+use crate::metrics::{
+    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
+};
 use crate::node::Node;

 /// ## What do we store?
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }

+#[derive(measured::FixedCardinalityLabel, Clone)]
+pub(crate) enum DatabaseOperation {
+    InsertNode,
+    UpdateNode,
+    DeleteNode,
+    ListNodes,
+    BeginShardSplit,
+    CompleteShardSplit,
+    AbortShardSplit,
+    Detach,
+    ReAttach,
+    IncrementGeneration,
+    ListTenantShards,
+    InsertTenantShards,
+    UpdateTenantShard,
+    DeleteTenant,
+    UpdateTenantConfig,
+}
+
 #[must_use]
 pub(crate) enum AbortShardSplitStatus {
    /// We aborted the split in the database by reverting to the parent shards
@@ -115,6 +137,34 @@ impl Persistence {
        }
    }

+    /// Wraps `with_conn` in order to collect latency and error metrics
+    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
+    where
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        R: Send + 'static,
+    {
+        let latency = &METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_database_query_latency;
+        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
+            operation: op.clone(),
+        });
+
+        let res = self.with_conn(func).await;
+
+        if let Err(err) = &res {
+            let error_counter = &METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_database_query_error;
+            error_counter.inc(DatabaseQueryErrorLabelGroup {
+                error_type: err.error_label(),
+                operation: op,
+            })
+        }
+
+        res
+    }
+
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
@@ -130,21 +180,27 @@ impl Persistence {
    /// When a node is first registered, persist it before using it for anything
    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
        let np = node.to_persistent();
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::insert_into(crate::schema::nodes::table)
-                .values(&np)
-                .execute(conn)?;
-            Ok(())
-        })
+        self.with_measured_conn(
+            DatabaseOperation::InsertNode,
+            move |conn| -> DatabaseResult<()> {
+                diesel::insert_into(crate::schema::nodes::table)
+                    .values(&np)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
        .await
    }

    /// At startup, populate the list of nodes which our shards may be placed on
    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
        let nodes: Vec<NodePersistence> = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
-            })
+            .with_measured_conn(
+                DatabaseOperation::ListNodes,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
+                },
+            )
            .await?;

        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -159,7 +215,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
        let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
                let updated = diesel::update(nodes)
                    .filter(node_id.eq(input_node_id.0 as i64))
                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -181,9 +237,12 @@ impl Persistence {
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let loaded = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
-            })
+            .with_measured_conn(
+                DatabaseOperation::ListTenantShards,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+                },
+            )
            .await?;

        if loaded.is_empty() {
@@ -211,15 +270,10 @@ impl Persistence {

        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = tenant_id.to_string();
-                tenant.config = serde_json::to_string(&TenantConfig::default())
-                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
-                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+        for shard in decoded.tenants.values_mut() {
+            if shard.placement_policy == "\"Single\"" {
+                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
+                shard.placement_policy = "{\"Attached\":0}".to_string();
            }
        }

@@ -265,17 +319,20 @@ impl Persistence {
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                for tenant in &shards {
-                    diesel::insert_into(tenant_shards)
-                        .values(tenant)
-                        .execute(conn)?;
-                }
+        self.with_measured_conn(
+            DatabaseOperation::InsertTenantShards,
+            move |conn| -> DatabaseResult<()> {
+                conn.transaction(|conn| -> QueryResult<()> {
+                    for tenant in &shards {
+                        diesel::insert_into(tenant_shards)
+                            .values(tenant)
+                            .execute(conn)?;
+                    }
+                    Ok(())
+                })?;
                Ok(())
-            })?;
-            Ok(())
-        })
+            },
+        )
        .await
    }

@@ -283,25 +340,31 @@ impl Persistence {
    /// the tenant from memory on this server.
    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::delete(tenant_shards)
-                .filter(tenant_id.eq(del_tenant_id.to_string()))
-                .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::DeleteTenant,
+            move |conn| -> DatabaseResult<()> {
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(del_tenant_id.to_string()))
+                    .execute(conn)?;

-            Ok(())
-        })
+                Ok(())
+            },
+        )
        .await
    }

    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::delete(nodes)
-                .filter(node_id.eq(del_node_id.0 as i64))
-                .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::DeleteNode,
+            move |conn| -> DatabaseResult<()> {
+                diesel::delete(nodes)
+                    .filter(node_id.eq(del_node_id.0 as i64))
+                    .execute(conn)?;

-            Ok(())
-        })
+                Ok(())
+            },
+        )
        .await
    }

@@ -315,7 +378,7 @@ impl Persistence {
    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
                let rows_updated = diesel::update(tenant_shards)
                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .set(generation.eq(generation + 1))
@@ -365,7 +428,7 @@ impl Persistence {
    ) -> anyhow::Result<Generation> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -409,7 +472,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
            let query = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -450,7 +513,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
            diesel::update(tenant_shards)
                .filter(tenant_id.eq(input_tenant_id.to_string()))
                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
@@ -465,7 +528,7 @@ impl Persistence {

    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
            let updated = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -495,7 +558,7 @@ impl Persistence {
        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
            conn.transaction(|conn| -> DatabaseResult<()> {
                // Mark parent shards as splitting

@@ -559,26 +622,29 @@ impl Persistence {
        old_shard_count: ShardCount,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                // Drop parent shards
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.literal() as i32))
-                    .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::CompleteShardSplit,
+            move |conn| -> DatabaseResult<()> {
+                conn.transaction(|conn| -> QueryResult<()> {
+                    // Drop parent shards
+                    diesel::delete(tenant_shards)
+                        .filter(tenant_id.eq(split_tenant_id.to_string()))
+                        .filter(shard_count.eq(old_shard_count.literal() as i32))
+                        .execute(conn)?;

-                // Clear sharding flag
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .set((splitting.eq(0),))
-                    .execute(conn)?;
-                debug_assert!(updated > 0);
+                    // Clear sharding flag
+                    let updated = diesel::update(tenant_shards)
+                        .filter(tenant_id.eq(split_tenant_id.to_string()))
+                        .set((splitting.eq(0),))
+                        .execute(conn)?;
+                    debug_assert!(updated > 0);
+
+                    Ok(())
+                })?;

                Ok(())
-            })?;
-
-            Ok(())
-        })
+            },
+        )
        .await
    }

@@ -590,40 +656,44 @@ impl Persistence {
        new_shard_count: ShardCount,
    ) -> DatabaseResult<AbortShardSplitStatus> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-            let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                // Clear the splitting state on parent shards
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.ne(new_shard_count.literal() as i32))
-                    .set((splitting.eq(0),))
-                    .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::AbortShardSplit,
+            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+                let aborted =
+                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
+                        // Clear the splitting state on parent shards
+                        let updated = diesel::update(tenant_shards)
+                            .filter(tenant_id.eq(split_tenant_id.to_string()))
+                            .filter(shard_count.ne(new_shard_count.literal() as i32))
+                            .set((splitting.eq(0),))
+                            .execute(conn)?;

-                // Parent shards are already gone: we cannot abort.
-                if updated == 0 {
-                    return Ok(AbortShardSplitStatus::Complete);
-                }
+                        // Parent shards are already gone: we cannot abort.
+                        if updated == 0 {
+                            return Ok(AbortShardSplitStatus::Complete);
+                        }

-                // Sanity check: if parent shards were present, their cardinality should
-                // be less than the number of child shards.
-                if updated >= new_shard_count.count() as usize {
-                    return Err(DatabaseError::Logical(format!(
-                        "Unexpected parent shard count {updated} while aborting split to \
+                        // Sanity check: if parent shards were present, their cardinality should
+                        // be less than the number of child shards.
+                        if updated >= new_shard_count.count() as usize {
+                            return Err(DatabaseError::Logical(format!(
+                                "Unexpected parent shard count {updated} while aborting split to \
                            count {new_shard_count:?} on tenant {split_tenant_id}"
-                    )));
-                }
+                            )));
+                        }

-                // Erase child shards
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(new_shard_count.literal() as i32))
-                    .execute(conn)?;
+                        // Erase child shards
+                        diesel::delete(tenant_shards)
+                            .filter(tenant_id.eq(split_tenant_id.to_string()))
+                            .filter(shard_count.eq(new_shard_count.literal() as i32))
+                            .execute(conn)?;

-                Ok(AbortShardSplitStatus::Aborted)
-            })?;
+                        Ok(AbortShardSplitStatus::Aborted)
+                    })?;

-            Ok(aborted)
-        })
+                Ok(aborted)
+            },
+        )
        .await
    }
 }
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,3 +1,4 @@
+use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
 use hyper::StatusCode;
@@ -117,6 +118,15 @@ impl Reconciler {
        flush_ms: Option<Duration>,
        lazy: bool,
    ) -> Result<(), ReconcileError> {
+        if !node.is_available() && config.mode == LocationConfigMode::Detached {
+            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
+            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
+            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
+            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
+            self.observed.locations.remove(&node.get_id());
+            return Ok(());
+        }
+
        self.observed
            .locations
            .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -149,9 +159,16 @@ impl Reconciler {
        };
        tracing::info!("location_config({node}) complete: {:?}", config);

-        self.observed
-            .locations
-            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
+        match config.mode {
+            LocationConfigMode::Detached => {
+                self.observed.locations.remove(&node.get_id());
+            }
+            _ => {
+                self.observed
+                    .locations
+                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
+            }
+        }

        Ok(())
    }
@@ -243,8 +260,11 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
+        let client = PageserverClient::new(
+            node.get_id(),
+            node.base_url(),
+            self.service_config.jwt_token.as_deref(),
+        );

        let timelines = client.timeline_list(&tenant_shard_id).await?;
        Ok(timelines
@@ -475,7 +495,7 @@ impl Reconciler {
            }
        }

-        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
        // this location will be deleted in the general case reconciliation that runs after this.
        let origin_secondary_conf = build_location_config(
            &self.shard,
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -7,7 +7,9 @@ use std::{
    time::{Duration, Instant},
 };

-use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
+use crate::{
+    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
+};
 use anyhow::Context;
 use control_plane::storage_controller::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -18,12 +20,14 @@ use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
+        TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
+        TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest},
 };

+use crate::pageserver_client::PageserverClient;
 use pageserver_api::{
    models::{
        self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
@@ -200,6 +204,30 @@ enum TenantCreateOrUpdate {
    Update(Vec<ShardUpdate>),
 }

+struct ShardSplitParams {
+    old_shard_count: ShardCount,
+    new_shard_count: ShardCount,
+    new_stripe_size: Option<ShardStripeSize>,
+    targets: Vec<ShardSplitTarget>,
+    policy: PlacementPolicy,
+    config: TenantConfig,
+    shard_ident: ShardIdentity,
+}
+
+// When preparing for a shard split, we may either choose to proceed with the split,
+// or find that the work is already done and return NoOp.
+enum ShardSplitAction {
+    Split(ShardSplitParams),
+    NoOp(TenantShardSplitResponse),
+}
+
+// A parent shard which will be split
+struct ShardSplitTarget {
+    parent_id: TenantShardId,
+    node: Node,
+    child_ids: Vec<TenantShardId>,
+}
+
 /// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes
 /// might not be available.  We therefore use a queue of abort operations processed in the background.
 struct TenantShardSplitAbort {
@@ -525,7 +553,11 @@ impl Service {
                break;
            }

-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
            match client
                .location_config(
                    tenant_shard_id,
@@ -733,7 +765,19 @@ impl Service {
                tenant.waiter.advance(result.sequence);
            }
            Err(e) => {
-                tracing::warn!("Reconcile error: {}", e);
+                match e {
+                    ReconcileError::Cancel => {
+                        tracing::info!("Reconciler was cancelled");
+                    }
+                    ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
+                        // This might be due to the reconciler getting cancelled, or it might
+                        // be due to the `Node` being marked offline.
+                        tracing::info!("Reconciler cancelled during pageserver API call");
+                    }
+                    _ => {
+                        tracing::warn!("Reconcile error: {}", e);
+                    }
+                }

                // Ordering: populate last_error before advancing error_seq,
                // so that waiters will see the correct error after waiting.
@@ -1057,7 +1101,7 @@ impl Service {
                shard_stripe_size: 0,
                generation: Some(0),
                generation_pageserver: None,
-                placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(),
+                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                splitting: SplitState::default(),
            };
@@ -1084,7 +1128,7 @@ impl Service {
                        TenantState::new(
                            attach_req.tenant_shard_id,
                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Single,
+                            PlacementPolicy::Attached(0),
                        ),
                    );
                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
@@ -1113,7 +1157,7 @@ impl Service {
                    self.persistence
                        .update_tenant_shard(
                            attach_req.tenant_shard_id,
-                            PlacementPolicy::Single,
+                            PlacementPolicy::Attached(0),
                            conf,
                            None,
                        )
@@ -1138,7 +1182,7 @@ impl Service {

        if let Some(new_generation) = new_generation {
            tenant_state.generation = Some(new_generation);
-            tenant_state.policy = PlacementPolicy::Single;
+            tenant_state.policy = PlacementPolicy::Attached(0);
        } else {
            // This is a detach notification.  We must update placement policy to avoid re-attaching
            // during background scheduling/reconciliation, or during storage controller restart.
@@ -1350,7 +1394,8 @@ impl Service {
            incremented_generations.len()
        );

-        // Apply the updated generation to our in-memory state
+        // Apply the updated generation to our in-memory state, and
+        // gather discover secondary locations.
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, scheduler) = locked.parts_mut();

@@ -1358,62 +1403,65 @@ impl Service {
            tenants: Vec::new(),
        };

-        for (tenant_shard_id, new_gen) in incremented_generations {
-            response.tenants.push(ReAttachResponseTenant {
-                id: tenant_shard_id,
-                gen: new_gen.into().unwrap(),
-            });
-            // Apply the new generation number to our in-memory state
-            let shard_state = tenants.get_mut(&tenant_shard_id);
-            let Some(shard_state) = shard_state else {
-                // Not fatal.  This edge case requires a re-attach to happen
-                // between inserting a new tenant shard in to the database, and updating our in-memory
-                // state to know about the shard, _and_ that the state inserted to the database referenced
-                // a pageserver.  Should never happen, but handle it rather than panicking, since it should
-                // be harmless.
-                tracing::error!(
-                    "Shard {} is in database for node {} but not in-memory state",
-                    tenant_shard_id,
-                    reattach_req.node_id
-                );
-                continue;
-            };
+        // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
+        // to call location_conf API with an old generation.  Wait for cancellation to complete
+        // before responding to this request.  Requires well implemented CancellationToken logic
+        // all the way to where we call location_conf.  Even then, there can still be a location_conf
+        // request in flight over the network: TODO handle that by making location_conf API refuse
+        // to go backward in generations.

-            // If [`Persistence::re_attach`] selected this shard, it must have alread
-            // had a generation set.
-            debug_assert!(shard_state.generation.is_some());
-            let Some(old_gen) = shard_state.generation else {
-                // Should never happen:  would only return incremented generation
-                // for a tenant that already had a non-null generation.
-                return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                    "Generation must be set while re-attaching"
-                )));
-            };
-            shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
-            if let Some(observed) = shard_state
-                .observed
-                .locations
-                .get_mut(&reattach_req.node_id)
-            {
-                if let Some(conf) = observed.conf.as_mut() {
-                    conf.generation = new_gen.into();
+        // Scan through all shards, applying updates for ones where we updated generation
+        // and identifying shards that intend to have a secondary location on this node.
+        for (tenant_shard_id, shard) in tenants {
+            if let Some(new_gen) = incremented_generations.get(tenant_shard_id) {
+                let new_gen = *new_gen;
+                response.tenants.push(ReAttachResponseTenant {
+                    id: *tenant_shard_id,
+                    gen: Some(new_gen.into().unwrap()),
+                    // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`]
+                    // execution.  If a pageserver is restarted during that process, then the reconcile pass will
+                    // fail, and start from scratch, so it doesn't make sense for us to try and preserve
+                    // the stale/multi states at this point.
+                    mode: LocationConfigMode::AttachedSingle,
+                });
+
+                shard.generation = std::cmp::max(shard.generation, Some(new_gen));
+                if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) {
+                    // Why can we update `observed` even though we're not sure our response will be received
+                    // by the pageserver?  Because the pageserver will not proceed with startup until
+                    // it has processed response: if it loses it, we'll see another request and increment
+                    // generation again, avoiding any uncertainty about dirtiness of tenant's state.
+                    if let Some(conf) = observed.conf.as_mut() {
+                        conf.generation = new_gen.into();
+                    }
+                } else {
+                    // This node has no observed state for the shard: perhaps it was offline
+                    // when the pageserver restarted.  Insert a None, so that the Reconciler
+                    // will be prompted to learn the location's state before it makes changes.
+                    shard
+                        .observed
+                        .locations
+                        .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
                }
-            } else {
-                // This node has no observed state for the shard: perhaps it was offline
-                // when the pageserver restarted.  Insert a None, so that the Reconciler
-                // will be prompted to learn the location's state before it makes changes.
-                shard_state
-                    .observed
-                    .locations
-                    .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
-            }
+            } else if shard.intent.get_secondary().contains(&reattach_req.node_id) {
+                // Ordering: pageserver will not accept /location_config requests until it has
+                // finished processing the response from re-attach.  So we can update our in-memory state
+                // now, and be confident that we are not stamping on the result of some later location config.
+                // TODO: however, we are not strictly ordered wrt ReconcileResults queue,
+                // so we might update observed state here, and then get over-written by some racing
+                // ReconcileResult.  The impact is low however, since we have set state on pageserver something
+                // that matches intent, so worst case if we race then we end up doing a spurious reconcile.

-            // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
-            // to call location_conf API with an old generation.  Wait for cancellation to complete
-            // before responding to this request.  Requires well implemented CancellationToken logic
-            // all the way to where we call location_conf.  Even then, there can still be a location_conf
-            // request in flight over the network: TODO handle that by making location_conf API refuse
-            // to go backward in generations.
+                response.tenants.push(ReAttachResponseTenant {
+                    id: *tenant_shard_id,
+                    gen: None,
+                    mode: LocationConfigMode::Secondary,
+                });
+
+                // We must not update observed, because we have no guarantee that our
+                // response will be received by the pageserver. This could leave it
+                // falsely dirty, but the resulting reconcile should be idempotent.
+            }
        }

        // We consider a node Active once we have composed a re-attach response, but we
@@ -1491,11 +1539,11 @@ impl Service {
        &self,
        create_req: TenantCreateRequest,
    ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
-        // As a default, single is convenient for tests that don't choose a policy.
        let placement_policy = create_req
            .placement_policy
            .clone()
-            .unwrap_or(PlacementPolicy::Single);
+            // As a default, zero secondaries is convenient for tests that don't choose a policy.
+            .unwrap_or(PlacementPolicy::Attached(0));

        // This service expects to handle sharding itself: it is an error to try and directly create
        // a particular shard here.
@@ -1705,11 +1753,11 @@ impl Service {
            | LocationConfigMode::AttachedSingle
            | LocationConfigMode::AttachedStale => {
                if nodes.len() > 1 {
-                    PlacementPolicy::Double(1)
+                    PlacementPolicy::Attached(1)
                } else {
                    // Convenience for dev/test: if we just have one pageserver, import
-                    // tenants into Single mode so that scheduling will succeed.
-                    PlacementPolicy::Single
+                    // tenants into non-HA mode so that scheduling will succeed.
+                    PlacementPolicy::Attached(0)
                }
            }
        };
@@ -2058,8 +2106,11 @@ impl Service {
                })
                .collect::<Vec<_>>();
            for tenant_shard_id in shard_ids {
-                let client =
-                    mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+                let client = PageserverClient::new(
+                    node.get_id(),
+                    node.base_url(),
+                    self.config.jwt_token.as_deref(),
+                );

                tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);

@@ -2111,7 +2162,11 @@ impl Service {
        // Issue concurrent requests to all shards' locations
        let mut futs = FuturesUnordered::new();
        for (tenant_shard_id, node) in targets {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
            futs.push(async move {
                let result = client
                    .tenant_secondary_download(tenant_shard_id, wait)
@@ -2204,7 +2259,11 @@ impl Service {
        // Phase 1: delete on the pageservers
        let mut any_pending = false;
        for (tenant_shard_id, node) in targets {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
            // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
            // surface immediately as an error to our caller.
            let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
@@ -2316,7 +2375,7 @@ impl Service {
                tenant_shard_id,
                create_req.new_timeline_id,
            );
-            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
+            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());

            client
                .timeline_create(tenant_shard_id, &create_req)
@@ -2440,7 +2499,7 @@ impl Service {
                "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
            );

-            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
+            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
            client
                .timeline_delete(tenant_shard_id, timeline_id)
                .await
@@ -2481,11 +2540,11 @@ impl Service {
    }

    /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
-    /// function looks it up and returns the url.  If the tenant isn't found, returns Err(ApiError::NotFound)
-    pub(crate) fn tenant_shard0_baseurl(
+    /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
+    pub(crate) fn tenant_shard0_node(
        &self,
        tenant_id: TenantId,
-    ) -> Result<(String, TenantShardId), ApiError> {
+    ) -> Result<(Node, TenantShardId), ApiError> {
        let locked = self.inner.read().unwrap();
        let Some((tenant_shard_id, shard)) = locked
            .tenants
@@ -2517,7 +2576,7 @@ impl Service {
            )));
        };

-        Ok((node.base_url(), *tenant_shard_id))
+        Ok((node.clone(), *tenant_shard_id))
    }

    pub(crate) fn tenant_locate(
@@ -2527,9 +2586,6 @@ impl Service {
        let locked = self.inner.read().unwrap();
        tracing::info!("Locating shards for tenant {tenant_id}");

-        // Take a snapshot of pageservers
-        let pageservers = locked.nodes.clone();
-
        let mut result = Vec::new();
        let mut shard_params: Option<ShardParameters> = None;

@@ -2543,7 +2599,8 @@ impl Service {
                        "Cannot locate a tenant that is not attached"
                    )))?;

-            let node = pageservers
+            let node = locked
+                .nodes
                .get(&node_id)
                .expect("Pageservers may not be deleted while referenced");

@@ -2591,6 +2648,47 @@ impl Service {
        })
    }

+    pub(crate) fn tenant_describe(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<TenantDescribeResponse, ApiError> {
+        let locked = self.inner.read().unwrap();
+
+        let mut shard_zero = None;
+        let mut shards = Vec::new();
+
+        for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+        {
+            if tenant_shard_id.is_zero() {
+                shard_zero = Some(shard);
+            }
+
+            let response_shard = TenantDescribeResponseShard {
+                tenant_shard_id: *tenant_shard_id,
+                node_attached: *shard.intent.get_attached(),
+                node_secondary: shard.intent.get_secondary().to_vec(),
+                last_error: shard.last_error.lock().unwrap().clone(),
+                is_reconciling: shard.reconciler.is_some(),
+                is_pending_compute_notification: shard.pending_compute_notification,
+                is_splitting: matches!(shard.splitting, SplitState::Splitting),
+            };
+            shards.push(response_shard);
+        }
+
+        let Some(shard_zero) = shard_zero else {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+            ));
+        };
+
+        Ok(TenantDescribeResponse {
+            shards,
+            stripe_size: shard_zero.shard.stripe_size,
+            policy: shard_zero.policy.clone(),
+            config: shard_zero.config.clone(),
+        })
+    }
+
    #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
    async fn abort_tenant_shard_split(
        &self,
@@ -2648,7 +2746,7 @@ impl Service {
        let detach_locations: Vec<(Node, TenantShardId)> = {
            let mut detach_locations = Vec::new();
            let mut locked = self.inner.write().unwrap();
-            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            let (nodes, tenants, scheduler) = locked.parts_mut();

            for (tenant_shard_id, shard) in
                tenants.range_mut(TenantShardId::tenant_range(op.tenant_id))
@@ -2681,6 +2779,13 @@ impl Service {

                tracing::info!("Restoring parent shard {tenant_shard_id}");
                shard.splitting = SplitState::Idle;
+                if let Err(e) = shard.schedule(scheduler) {
+                    // If this shard can't be scheduled now (perhaps due to offline nodes or
+                    // capacity issues), that must not prevent us rolling back a split.  In this
+                    // case it should be eventually scheduled in the background.
+                    tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}")
+                }
+
                self.maybe_reconcile_shard(shard, nodes);
            }

@@ -2772,7 +2877,7 @@ impl Service {
                .map(|(shard_id, _)| *shard_id)
                .collect::<Vec<_>>();

-            let (_nodes, tenants, scheduler) = locked.parts_mut();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
            for parent_id in parent_ids {
                let child_ids = parent_id.split(new_shard_count);

@@ -2814,7 +2919,7 @@ impl Service {
                                generation,
                                &child_shard,
                                &config,
-                                matches!(policy, PlacementPolicy::Double(n) if n > 0),
+                                matches!(policy, PlacementPolicy::Attached(n) if n > 0),
                            )),
                        },
                    );
@@ -2839,6 +2944,8 @@ impl Service {
                        // find a secondary (e.g. because cluster is overloaded).
                        tracing::warn!("Failed to schedule child shard {child}: {e}");
                    }
+                    // In the background, attach secondary locations for the new shards
+                    self.maybe_reconcile_shard(&mut child_state, nodes);

                    tenants.insert(child, child_state);
                    response.new_shards.push(child);
@@ -2861,17 +2968,23 @@ impl Service {
        let new_shard_count = ShardCount::new(split_req.new_shard_count);
        let new_stripe_size = split_req.new_stripe_size;

-        let r = self.do_tenant_shard_split(tenant_id, split_req).await;
+        // Validate the request and construct parameters.  This phase is fallible, but does not require
+        // rollback on errors, as it does no I/O and mutates no state.
+        let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? {
+            ShardSplitAction::NoOp(resp) => return Ok(resp),
+            ShardSplitAction::Split(params) => params,
+        };
+
+        // Execute this split: this phase mutates state and does remote I/O on pageservers.  If it fails,
+        // we must roll back.
+        let r = self
+            .do_tenant_shard_split(tenant_id, shard_split_params)
+            .await;

        match r {
            Ok(r) => Ok(r),
-            Err(ApiError::BadRequest(_)) => {
-                // A request validation error does not require rollback: we rejected it before we started making any changes: just
-                // return the error
-                r
-            }
            Err(e) => {
-                // General case error handling: split might be part-done, we must do work to abort it.
+                // Split might be part-done, we must do work to abort it.
                tracing::warn!("Enqueuing background abort of split on {tenant_id}");
                self.abort_tx
                    .send(TenantShardSplitAbort {
@@ -2887,25 +3000,18 @@ impl Service {
        }
    }

-    pub(crate) async fn do_tenant_shard_split(
+    fn prepare_tenant_shard_split(
        &self,
        tenant_id: TenantId,
        split_req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse, ApiError> {
-        let mut policy = None;
-        let mut shard_ident = None;
-
-        // A parent shard which will be split
-        struct SplitTarget {
-            parent_id: TenantShardId,
-            node: Node,
-            child_ids: Vec<TenantShardId>,
-        }
-
+    ) -> Result<ShardSplitAction, ApiError> {
        fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
            anyhow::anyhow!("failpoint")
        )));

+        let mut policy = None;
+        let mut config = None;
+        let mut shard_ident = None;
        // Validate input, and calculate which shards we will create
        let (old_shard_count, targets) =
            {
@@ -2961,6 +3067,9 @@ impl Service {
                    if shard_ident.is_none() {
                        shard_ident = Some(shard.shard);
                    }
+                    if config.is_none() {
+                        config = Some(shard.config.clone());
+                    }

                    if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                        tracing::info!(
@@ -2979,9 +3088,7 @@ impl Service {
                        .get(&node_id)
                        .expect("Pageservers may not be deleted while referenced");

-                    // TODO: if any reconciliation is currently in progress for this shard, wait for it.
-
-                    targets.push(SplitTarget {
+                    targets.push(ShardSplitTarget {
                        parent_id: *tenant_shard_id,
                        node: node.clone(),
                        child_ids: tenant_shard_id
@@ -2991,9 +3098,9 @@ impl Service {

                if targets.is_empty() {
                    if children_found.len() == split_req.new_shard_count as usize {
-                        return Ok(TenantShardSplitResponse {
+                        return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse {
                            new_shards: children_found,
-                        });
+                        }));
                    } else {
                        // No shards found to split, and no existing children found: the
                        // tenant doesn't exist at all.
@@ -3023,13 +3130,77 @@ impl Service {
            shard_ident.unwrap()
        };
        let policy = policy.unwrap();
+        let config = config.unwrap();

+        Ok(ShardSplitAction::Split(ShardSplitParams {
+            old_shard_count,
+            new_shard_count: ShardCount::new(split_req.new_shard_count),
+            new_stripe_size: split_req.new_stripe_size,
+            targets,
+            policy,
+            config,
+            shard_ident,
+        }))
+    }
+
+    async fn do_tenant_shard_split(
+        &self,
+        tenant_id: TenantId,
+        params: ShardSplitParams,
+    ) -> Result<TenantShardSplitResponse, ApiError> {
        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
        // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
        // parent shards exist as expected, but it would be neater to do the above pre-checks within the
        // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
        // (https://github.com/neondatabase/neon/issues/6676)

+        let ShardSplitParams {
+            old_shard_count,
+            new_shard_count,
+            new_stripe_size,
+            mut targets,
+            policy,
+            config,
+            shard_ident,
+        } = params;
+
+        // Drop any secondary locations: pageservers do not support splitting these, and in any case the
+        // end-state for a split tenant will usually be to have secondary locations on different nodes.
+        // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation
+        // at the time of split.
+        let waiters = {
+            let mut locked = self.inner.write().unwrap();
+            let mut waiters = Vec::new();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+            for target in &mut targets {
+                let Some(shard) = tenants.get_mut(&target.parent_id) else {
+                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} not found",
+                        target.parent_id
+                    )));
+                };
+
+                if shard.intent.get_attached() != &Some(target.node.get_id()) {
+                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
+                    return Err(ApiError::Conflict(format!(
+                        "Shard {} unexpectedly rescheduled during split",
+                        target.parent_id
+                    )));
+                }
+
+                // Irrespective of PlacementPolicy, clear secondary locations from intent
+                shard.intent.clear_secondary(scheduler);
+
+                // Run Reconciler to execute detach fo secondary locations.
+                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
+
        // Before creating any new child shards in memory or on the pageservers, persist them: this
        // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
        // acts as the protection against two concurrent attempts to split: one of them will get a database
@@ -3058,8 +3229,7 @@ impl Service {
                    generation: None,
                    generation_pageserver: Some(target.node.get_id().0 as i64),
                    placement_policy: serde_json::to_string(&policy).unwrap(),
-                    // TODO: get the config out of the map
-                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                    config: serde_json::to_string(&config).unwrap(),
                    splitting: SplitState::Splitting,
                });
            }
@@ -3111,18 +3281,22 @@ impl Service {
        // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).

        for target in &targets {
-            let SplitTarget {
+            let ShardSplitTarget {
                parent_id,
                node,
                child_ids,
            } = target;
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
            let response = client
                .tenant_shard_split(
                    *parent_id,
                    TenantShardSplitRequest {
-                        new_shard_count: split_req.new_shard_count,
-                        new_stripe_size: split_req.new_stripe_size,
+                        new_shard_count: new_shard_count.literal(),
+                        new_stripe_size,
                    },
                )
                .await
@@ -3171,11 +3345,8 @@ impl Service {
        ));

        // Replace all the shards we just split with their children: this phase is infallible.
-        let (response, child_locations) = self.tenant_shard_split_commit_inmem(
-            tenant_id,
-            ShardCount::new(split_req.new_shard_count),
-            split_req.new_stripe_size,
-        );
+        let (response, child_locations) =
+            self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);

        // Send compute notifications for all the new shards
        let mut failed_notifications = Vec::new();
@@ -3240,17 +3411,20 @@ impl Service {
                let old_attached = *shard.intent.get_attached();

                match shard.policy {
-                    PlacementPolicy::Single => {
-                        shard.intent.clear_secondary(scheduler);
-                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
-                    }
-                    PlacementPolicy::Double(_n) => {
+                    PlacementPolicy::Attached(n) => {
                        // If our new attached node was a secondary, it no longer should be.
                        shard.intent.remove_secondary(scheduler, migrate_req.node_id);

                        // If we were already attached to something, demote that to a secondary
                        if let Some(old_attached) = old_attached {
-                            shard.intent.push_secondary(scheduler, old_attached);
+                            if n > 0 {
+                                // Remove other secondaries to make room for the location we'll demote
+                                while shard.intent.get_secondary().len() >= n {
+                                    shard.intent.pop_secondary(scheduler);
+                                }
+
+                                shard.intent.push_secondary(scheduler, old_attached);
+                            }
                        }

                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
@@ -3276,7 +3450,7 @@ impl Service {
        if let Some(waiter) = waiter {
            waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
        } else {
-            tracing::warn!("Migration is a no-op");
+            tracing::info!("Migration is a no-op");
        }

        Ok(TenantShardMigrateResponse {})
@@ -3631,6 +3805,13 @@ impl Service {
                        observed_loc.conf = None;
                    }

+                    if new_nodes.len() == 1 {
+                        // Special case for single-node cluster: there is no point trying to reschedule
+                        // any tenant shards: avoid doing so, in order to avoid spewing warnings about
+                        // failures to schedule them.
+                        continue;
+                    }
+
                    if tenant_state.intent.demote_attached(node_id) {
                        tenant_state.sequence = tenant_state.sequence.next();
                        match tenant_state.schedule(scheduler) {
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -4,7 +4,10 @@ use std::{
    time::Duration,
 };

-use crate::{metrics, persistence::TenantShardPersistence};
+use crate::{
+    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
+    persistence::TenantShardPersistence,
+};
 use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -457,22 +460,7 @@ impl TenantState {
        // Add/remove nodes to fulfil policy
        use PlacementPolicy::*;
        match self.policy {
-            Single => {
-                // Should have exactly one attached, and zero secondaries
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
-                    modified = true;
-                }
-
-                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
-                modified |= modified_attached;
-
-                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
-                    modified = true;
-                }
-            }
-            Double(secondary_count) => {
+            Attached(secondary_count) => {
                let retain_secondaries = if self.intent.attached.is_none()
                    && scheduler.node_preferred(&self.intent.secondary).is_some()
                {
@@ -733,7 +721,10 @@ impl TenantState {
        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::RECONCILER.spawned.inc();
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_reconcile_spawn
+            .inc();
        let result_tx = result_tx.clone();
        let join_handle = tokio::task::spawn(
            async move {
@@ -751,10 +742,12 @@ impl TenantState {
                // TODO: wrap all remote API operations in cancellation check
                // as well.
                if reconciler.cancel.is_cancelled() {
-                    metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
-                        .inc();
+                    metrics::METRICS_REGISTRY
+                        .metrics_group
+                        .storage_controller_reconcile_complete
+                        .inc(ReconcileCompleteLabelGroup {
+                            status: ReconcileOutcome::Cancel,
+                        });
                    return;
                }

@@ -769,18 +762,18 @@ impl TenantState {
                }

                // Update result counter
-                match &result {
-                    Ok(_) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
-                    Err(ReconcileError::Cancel) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
-                    Err(_) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
-                }
-                .inc();
+                let outcome_label = match &result {
+                    Ok(_) => ReconcileOutcome::Success,
+                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
+                    Err(_) => ReconcileOutcome::Error,
+                };
+
+                metrics::METRICS_REGISTRY
+                    .metrics_group
+                    .storage_controller_reconcile_complete
+                    .inc(ReconcileCompleteLabelGroup {
+                        status: outcome_label,
+                    });

                result_tx
                    .send(ReconcileResult {
@@ -895,7 +888,7 @@ pub(crate) mod tests {

        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
        tenant_state
            .schedule(&mut scheduler)
            .expect("we have enough nodes, scheduling should work");
@@ -943,7 +936,7 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));

        tenant_state.observed.locations.insert(
            NodeId(3),
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -437,7 +437,7 @@ async fn handle_tenant(

            let placement_policy = match create_match.get_one::<String>("placement-policy") {
                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Single,
+                _ => PlacementPolicy::Attached(0),
            };

            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -523,88 +523,6 @@ async fn handle_tenant(
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
-        Some(("migrate", matches)) => {
-            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
-            let new_pageserver = get_pageserver(env, matches)?;
-            let new_pageserver_id = new_pageserver.conf.id;
-
-            let storage_controller = StorageController::from_env(env);
-            storage_controller
-                .tenant_migrate(tenant_shard_id, new_pageserver_id)
-                .await?;
-
-            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
-        }
-        Some(("status", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-
-            let mut shard_table = comfy_table::Table::new();
-            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
-
-            let mut tenant_synthetic_size = None;
-
-            let storage_controller = StorageController::from_env(env);
-            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
-                let pageserver =
-                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
-
-                let size = pageserver
-                    .http_client
-                    .tenant_details(shard.shard_id)
-                    .await?
-                    .tenant_info
-                    .current_physical_size
-                    .unwrap();
-
-                shard_table.add_row([
-                    format!("{}", shard.shard_id.shard_slug()),
-                    format!("{}", shard.node_id.0),
-                    format!("{} MiB", size / (1024 * 1024)),
-                ]);
-
-                if shard.shard_id.is_zero() {
-                    tenant_synthetic_size =
-                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
-                }
-            }
-
-            let Some(synthetic_size) = tenant_synthetic_size else {
-                bail!("Shard 0 not found")
-            };
-
-            let mut tenant_table = comfy_table::Table::new();
-            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
-            tenant_table.add_row([
-                "Synthetic size".to_string(),
-                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
-            ]);
-
-            println!("{tenant_table}");
-            println!("{shard_table}");
-        }
-        Some(("shard-split", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
-            let shard_stripe_size: Option<ShardStripeSize> = matches
-                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
-                .cloned()
-                .unwrap();
-
-            let storage_controller = StorageController::from_env(env);
-            let result = storage_controller
-                .tenant_split(tenant_id, shard_count, shard_stripe_size)
-                .await?;
-            println!(
-                "Split tenant {} into shards {}",
-                tenant_id,
-                result
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }

        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -1578,19 +1496,6 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("migrate")
-                .about("Migrate a tenant from one pageserver to another")
-                .arg(tenant_id_arg.clone())
-                .arg(pageserver_id_arg.clone()))
-            .subcommand(Command::new("status")
-                .about("Human readable summary of the tenant's shards and attachment locations")
-                .arg(tenant_id_arg.clone()))
-            .subcommand(Command::new("shard-split")
-                .about("Increase the number of shards in the tenant")
-                .arg(tenant_id_arg.clone())
-                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
-                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
-                )
        )
        .subcommand(
            Command::new("pageserver")
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -127,8 +127,8 @@ pub struct PageServerConf {
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,

-    pub(crate) virtual_file_io_engine: String,
-    pub(crate) get_vectored_impl: String,
+    pub(crate) virtual_file_io_engine: Option<String>,
+    pub(crate) get_vectored_impl: Option<String>,
 }

 impl Default for PageServerConf {
@@ -139,9 +139,8 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            // FIXME: use the ones exposed by pageserver crate
-            virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
-            get_vectored_impl: "sequential".to_owned(),
+            virtual_file_io_engine: None,
+            get_vectored_impl: None,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -101,8 +101,16 @@ impl PageServerNode {

        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
-        let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
-        let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
+        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
+            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
+        } else {
+            String::new()
+        };
+        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
+            format!("get_vectored_impl='{get_vectored_impl}'")
+        } else {
+            String::new()
+        };

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -475,7 +475,7 @@ impl StorageController {
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
            Method::GET,
-            format!("control/v1/tenant/{tenant_id}/locate"),
+            format!("debug/v1/tenant/{tenant_id}/locate"),
            None,
        )
        .await
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -6,7 +6,10 @@ use std::str::FromStr;
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-use crate::{models::ShardParameters, shard::TenantShardId};
+use crate::{
+    models::{ShardParameters, TenantConfig},
+    shard::{ShardStripeSize, TenantShardId},
+};

 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
@@ -57,6 +60,31 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TenantDescribeResponse {
+    pub shards: Vec<TenantDescribeResponseShard>,
+    pub stripe_size: ShardStripeSize,
+    pub policy: PlacementPolicy,
+    pub config: TenantConfig,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantDescribeResponseShard {
+    pub tenant_shard_id: TenantShardId,
+
+    pub node_attached: Option<NodeId>,
+    pub node_secondary: Vec<NodeId>,
+
+    pub last_error: String,
+
+    /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
+    pub is_reconciling: bool,
+    /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
+    pub is_pending_compute_notification: bool,
+    /// A shard split is currently underway
+    pub is_splitting: bool,
+}
+
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -181,11 +209,8 @@ impl From<NodeSchedulingPolicy> for String {
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub enum PlacementPolicy {
-    /// Cheapest way to attach a tenant: just one pageserver, no secondary
-    Single,
-    /// Production-ready way to attach a tenant: one attached pageserver and
-    /// some number of secondaries.
-    Double(usize),
+    /// Normal live state: one attached pageserver and zero or more secondaries.
+    Attached(usize),
    /// Create one secondary mode locations. This is useful when onboarding
    /// a tenant, or for an idle tenant that we might want to bring online quickly.
    Secondary,
@@ -207,14 +232,14 @@ mod test {
    /// Check stability of PlacementPolicy's serialization
    #[test]
    fn placement_policy_encoding() -> anyhow::Result<()> {
-        let v = PlacementPolicy::Double(1);
+        let v = PlacementPolicy::Attached(1);
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "{\"Double\":1}");
+        assert_eq!(encoded, "{\"Attached\":1}");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);

-        let v = PlacementPolicy::Single;
+        let v = PlacementPolicy::Detached;
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "\"Single\"");
+        assert_eq!(encoded, "\"Detached\"");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
        Ok(())
    }
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,7 +6,9 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
+use crate::{
+    controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
+};

 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
@@ -20,12 +22,20 @@ pub struct ReAttachRequest {
    pub register: Option<NodeRegisterRequest>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct ReAttachResponseTenant {
-    pub id: TenantShardId,
-    pub gen: u32,
+fn default_mode() -> LocationConfigMode {
+    LocationConfigMode::AttachedSingle
 }

+#[derive(Serialize, Deserialize, Debug)]
+pub struct ReAttachResponseTenant {
+    pub id: TenantShardId,
+    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
+    pub gen: Option<u32>,
+
+    /// Default value only for backward compat: this field should be set
+    #[serde(default = "default_mode")]
+    pub mode: LocationConfigMode,
+}
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -198,6 +198,7 @@ impl LocalFs {
            fs::OpenOptions::new()
                .write(true)
                .create(true)
+                .truncate(true)
                .open(&temp_file_path)
                .await
                .with_context(|| {
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -13,6 +13,7 @@ testing = ["fail/failpoints"]
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
+async-compression.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
@@ -36,6 +37,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-tar.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
@@ -46,6 +48,7 @@ strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true
+walkdir.workspace = true

 pq_proto.workspace = true
 postgres_connection.workspace = true
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
    }
 }

-async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

    let started_at = std::time::Instant::now();
@@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .err_handler(route_error_handler)
 }

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -87,6 +87,8 @@ pub mod failpoint_support;

 pub mod yielding_loop;

+pub mod zstd;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -63,6 +63,7 @@ impl UnwrittenLockFile {
 pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
+        .truncate(true)
        .write(true)
        .open(lock_file_path)
        .context("open lock file")?;
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
    pub replytime: SystemTime,
+    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
+    pub shard_number: u32,
 }

-// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
-// Do not remove previously available fields because this might be backwards incompatible.
-pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
-
 impl PageserverFeedback {
    pub fn empty() -> PageserverFeedback {
        PageserverFeedback {
@@ -43,6 +41,7 @@ impl PageserverFeedback {
            remote_consistent_lsn: Lsn::INVALID,
            disk_consistent_lsn: Lsn::INVALID,
            replytime: *PG_EPOCH,
+            shard_number: 0,
        }
    }

@@ -59,17 +58,26 @@ impl PageserverFeedback {
    //
    // TODO: change serialized fields names once all computes migrate to rename.
    pub fn serialize(&self, buf: &mut BytesMut) {
-        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
+        let buf_ptr = buf.len();
+        buf.put_u8(0); // # of keys, will be filled later
+        let mut nkeys = 0;
+
+        nkeys += 1;
        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);

+        nkeys += 1;
        buf.put_slice(b"ps_writelsn\0");
        buf.put_i32(8);
        buf.put_u64(self.last_received_lsn.0);
+
+        nkeys += 1;
        buf.put_slice(b"ps_flushlsn\0");
        buf.put_i32(8);
        buf.put_u64(self.disk_consistent_lsn.0);
+
+        nkeys += 1;
        buf.put_slice(b"ps_applylsn\0");
        buf.put_i32(8);
        buf.put_u64(self.remote_consistent_lsn.0);
@@ -80,9 +88,19 @@ impl PageserverFeedback {
            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
            .as_micros() as i64;

+        nkeys += 1;
        buf.put_slice(b"ps_replytime\0");
        buf.put_i32(8);
        buf.put_i64(timestamp);
+
+        if self.shard_number > 0 {
+            nkeys += 1;
+            buf.put_slice(b"shard_number\0");
+            buf.put_i32(4);
+            buf.put_u32(self.shard_number);
+        }
+
+        buf[buf_ptr] = nkeys;
    }

    // Deserialize PageserverFeedback message
@@ -125,9 +143,8 @@ impl PageserverFeedback {
                }
                b"shard_number" => {
                    let len = buf.get_i32();
-                    // TODO: this will be implemented in the next update,
-                    //  for now, we just skip the value.
-                    buf.advance(len as usize);
+                    assert_eq!(len, 4);
+                    rf.shard_number = buf.get_u32();
                }
                _ => {
                    let len = buf.get_i32();
@@ -200,10 +217,7 @@ mod tests {
        rf.serialize(&mut data);

        // Add an extra field to the buffer and adjust number of keys
-        if let Some(first) = data.first_mut() {
-            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
-        }
-
+        data[0] += 1;
        data.put_slice(b"new_field_one\0");
        data.put_i32(8);
        data.put_u64(42);
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
        let mut swapped = Inner::default();
        let sem = swapped.init_semaphore.clone();
        // acquire and forget right away, moving the control over to InitPermit
@@ -543,7 +543,7 @@ mod tests {
        target.set(42, permit);

        let (_answer, permit) = {
-            let mut guard = target
+            let guard = target
                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
                .await
                .unwrap();
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,27 +1,60 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum VecMapOrdering {
+    Greater,
+    GreaterOrEqual,
+}
+
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
+/// Ordering can be adjusted using [`VecMapOrdering`]
+/// during `VecMap` construction.
 #[derive(Clone, Debug)]
-pub struct VecMap<K, V>(Vec<(K, V)>);
+pub struct VecMap<K, V> {
+    data: Vec<(K, V)>,
+    ordering: VecMapOrdering,
+}

 impl<K, V> Default for VecMap<K, V> {
    fn default() -> Self {
-        VecMap(Default::default())
+        VecMap {
+            data: Default::default(),
+            ordering: VecMapOrdering::Greater,
+        }
    }
 }

-#[derive(Debug)]
-pub struct InvalidKey;
+#[derive(thiserror::Error, Debug)]
+pub enum VecMapError {
+    #[error("Key violates ordering constraint")]
+    InvalidKey,
+    #[error("Mismatched ordering constraints")]
+    ExtendOrderingError,
+}

 impl<K: Ord, V> VecMap<K, V> {
+    pub fn new(ordering: VecMapOrdering) -> Self {
+        Self {
+            data: Vec::new(),
+            ordering,
+        }
+    }
+
+    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
+        Self {
+            data: Vec::with_capacity(capacity),
+            ordering,
+        }
+    }
+
    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
+        self.data.is_empty()
    }

    pub fn as_slice(&self) -> &[(K, V)] {
-        self.0.as_slice()
+        self.data.as_slice()
    }

    /// This function may panic if given a range where the lower bound is
@@ -29,7 +62,7 @@ impl<K: Ord, V> VecMap<K, V> {
    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
        use std::ops::Bound::*;

-        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
+        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);

        let start_idx = match range.start_bound() {
            Unbounded => 0,
@@ -41,7 +74,7 @@ impl<K: Ord, V> VecMap<K, V> {
        };

        let end_idx = match range.end_bound() {
-            Unbounded => self.0.len(),
+            Unbounded => self.data.len(),
            Included(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
@@ -49,34 +82,30 @@ impl<K: Ord, V> VecMap<K, V> {
            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
        };

-        &self.0[start_idx..end_idx]
+        &self.data[start_idx..end_idx]
    }

    /// Add a key value pair to the map.
-    /// If `key` is less than or equal to the current maximum key
-    /// the pair will not be added and InvalidKey error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
-        if let Some((last_key, _last_value)) = self.0.last() {
-            if &key <= last_key {
-                return Err(InvalidKey);
-            }
-        }
+    /// If `key` is not respective of the `self` ordering the
+    /// pair will not be added and `InvalidKey` error will be returned.
+    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
+        self.validate_key_order(&key)?;

        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok(delta_size)
    }

    /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If `key` is less than the current maximum key no updates or additions
-    /// will occur and InvalidKey error will be returned.
+    /// If `key` is not respective of the `self` ordering no updates or additions
+    /// will occur and `InvalidKey` error will be returned.
    pub fn append_or_update_last(
        &mut self,
        key: K,
        mut value: V,
-    ) -> Result<(Option<V>, usize), InvalidKey> {
-        if let Some((last_key, last_value)) = self.0.last_mut() {
+    ) -> Result<(Option<V>, usize), VecMapError> {
+        if let Some((last_key, last_value)) = self.data.last_mut() {
            match key.cmp(last_key) {
-                Ordering::Less => return Err(InvalidKey),
+                Ordering::Less => return Err(VecMapError::InvalidKey),
                Ordering::Equal => {
                    std::mem::swap(last_value, &mut value);
                    const DELTA_SIZE: usize = 0;
@@ -100,40 +129,67 @@ impl<K: Ord, V> VecMap<K, V> {
        V: Clone,
    {
        let split_idx = self
-            .0
+            .data
            .binary_search_by_key(&cutoff, extract_key)
            .unwrap_or_else(std::convert::identity);

        (
-            VecMap(self.0[..split_idx].to_vec()),
-            VecMap(self.0[split_idx..].to_vec()),
+            VecMap {
+                data: self.data[..split_idx].to_vec(),
+                ordering: self.ordering,
+            },
+            VecMap {
+                data: self.data[split_idx..].to_vec(),
+                ordering: self.ordering,
+            },
        )
    }

    /// Move items from `other` to the end of `self`, leaving `other` empty.
-    /// If any keys in `other` is less than or equal to any key in `self`,
-    /// `InvalidKey` error will be returned and no mutation will occur.
-    pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
-        let self_last_opt = self.0.last().map(extract_key);
-        let other_first_opt = other.0.last().map(extract_key);
+    /// If the `other` ordering is different from `self` ordering
+    /// `ExtendOrderingError` error will be returned.
+    /// If any keys in `other` is not respective of the ordering defined in
+    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
+    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
+        if self.ordering != other.ordering {
+            return Err(VecMapError::ExtendOrderingError);
+        }

-        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
-            if self_last >= other_first {
-                return Err(InvalidKey);
+        let other_first_opt = other.data.last().map(extract_key);
+        if let Some(other_first) = other_first_opt {
+            self.validate_key_order(other_first)?;
+        }
+
+        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
+        Ok(delta_size)
+    }
+
+    /// Validate the current last key in `self` and key being
+    /// inserted against the order defined in `self`.
+    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
+        if let Some(last_key) = self.data.last().map(extract_key) {
+            match (&self.ordering, &key.cmp(last_key)) {
+                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
+                    return Err(VecMapError::InvalidKey);
+                }
+                (VecMapOrdering::Greater, Ordering::Greater) => {}
+                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
+                    return Err(VecMapError::InvalidKey);
+                }
+                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
            }
        }

-        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
-        Ok(delta_size)
+        Ok(())
    }

    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
-        let old_cap = self.0.capacity();
-        op(&mut self.0);
-        let new_cap = self.0.capacity();
+        let old_cap = self.data.capacity();
+        op(&mut self.data);
+        let new_cap = self.data.capacity();

        match old_cap.cmp(&new_cap) {
            Ordering::Less => {
@@ -145,6 +201,36 @@ impl<K: Ord, V> VecMap<K, V> {
            Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
        }
    }
+
+    /// Similar to `from_iter` defined in `FromIter` trait except
+    /// that it accepts an [`VecMapOrdering`]
+    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
+        let iter = iter.into_iter();
+        let initial_capacity = {
+            match iter.size_hint() {
+                (lower_bound, None) => lower_bound,
+                (_, Some(upper_bound)) => upper_bound,
+            }
+        };
+
+        let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
+        for (key, value) in iter {
+            vec_map
+                .append(key, value)
+                .expect("The passed collection needs to be sorted!");
+        }
+
+        vec_map
+    }
+}
+
+impl<K: Ord, V> IntoIterator for VecMap<K, V> {
+    type Item = (K, V);
+    type IntoIter = std::vec::IntoIter<(K, V)>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.data.into_iter()
+    }
 }

 fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -155,7 +241,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
 mod tests {
    use std::{collections::BTreeMap, ops::Bound};

-    use super::VecMap;
+    use super::{VecMap, VecMapOrdering};

    #[test]
    fn unbounded_range() {
@@ -310,5 +396,59 @@ mod tests {
        left.extend(&mut one_map).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(one_map.as_slice(), &[(1, ())]);
+
+        let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
+        map_greater_or_equal.append(2, ()).unwrap();
+        map_greater_or_equal.append(2, ()).unwrap();
+
+        left.extend(&mut map_greater_or_equal).unwrap_err();
+        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
+        assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
+    }
+
+    #[test]
+    fn extend_with_ordering() {
+        let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
+        left.append(0, ()).unwrap();
+        assert_eq!(left.as_slice(), &[(0, ())]);
+
+        let mut greater_right = VecMap::new(VecMapOrdering::Greater);
+        greater_right.append(0, ()).unwrap();
+        left.extend(&mut greater_right).unwrap_err();
+        assert_eq!(left.as_slice(), &[(0, ())]);
+
+        let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
+        greater_or_equal_right.append(2, ()).unwrap();
+        greater_or_equal_right.append(2, ()).unwrap();
+        left.extend(&mut greater_or_equal_right).unwrap();
+        assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
+    }
+
+    #[test]
+    fn vec_map_from_sorted() {
+        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
+        let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
+        assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
+
+        let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
+        let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
+        assert_eq!(
+            vec_map.as_slice(),
+            &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
+        );
+    }
+
+    #[test]
+    #[should_panic]
+    fn vec_map_from_unsorted_greater() {
+        let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
+        let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
+    }
+
+    #[test]
+    #[should_panic]
+    fn vec_map_from_unsorted_greater_or_equal() {
+        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
+        let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
    }
 }
--- a/libs/utils/src/zstd.rs
+++ b/libs/utils/src/zstd.rs
@@ -0,0 +1,78 @@
+use std::io::SeekFrom;
+
+use anyhow::{Context, Result};
+use async_compression::{
+    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
+    zstd::CParameter,
+    Level,
+};
+use camino::Utf8Path;
+use nix::NixPath;
+use tokio::{
+    fs::{File, OpenOptions},
+    io::AsyncBufRead,
+    io::AsyncSeekExt,
+    io::AsyncWriteExt,
+};
+use tokio_tar::{Archive, Builder, HeaderMode};
+use walkdir::WalkDir;
+
+/// Creates a Zstandard tarball.
+pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tarball)
+        .await
+        .with_context(|| format!("tempfile creation {tarball}"))?;
+
+    let mut paths = Vec::new();
+    for entry in WalkDir::new(path) {
+        let entry = entry?;
+        let metadata = entry.metadata().expect("error getting dir entry metadata");
+        // Also allow directories so that we also get empty directories
+        if !(metadata.is_file() || metadata.is_dir()) {
+            continue;
+        }
+        let path = entry.into_path();
+        paths.push(path);
+    }
+    // Do a sort to get a more consistent listing
+    paths.sort_unstable();
+    let zstd = ZstdEncoder::with_quality_and_params(
+        file,
+        Level::Default,
+        &[CParameter::enable_long_distance_matching(true)],
+    );
+    let mut builder = Builder::new(zstd);
+    // Use reproducible header mode
+    builder.mode(HeaderMode::Deterministic);
+    for p in paths {
+        let rel_path = p.strip_prefix(path)?;
+        if rel_path.is_empty() {
+            // The top directory should not be compressed,
+            // the tar crate doesn't like that
+            continue;
+        }
+        builder.append_path_with_name(&p, rel_path).await?;
+    }
+    let mut zstd = builder.into_inner().await?;
+    zstd.shutdown().await?;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
+}
+
+/// Creates a Zstandard tarball.
+pub async fn extract_zst_tarball(
+    path: &Utf8Path,
+    tarball: impl AsyncBufRead + Unpin,
+) -> Result<()> {
+    let decoder = Box::pin(ZstdDecoder::new(tarball));
+    let mut archive = Archive::new(decoder);
+    archive.unpack(path).await?;
+    Ok(())
+}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
    }
 }

-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp))
+        (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
    }
 }

--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,7 +142,7 @@ pub trait ApiImpl {
        todo!()
    }

-    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
+    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
        todo!()
    }

--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,160 +1,156 @@
-//! Simple benchmarking around walredo.
+//! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
-//! Right now they hope to just set a baseline. Later we can try to expand into latency and
-//! throughput after figuring out the coordinated omission problems below.
+//! The benchmark implementation ([`bench_impl`]) is parametrized by
+//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
+//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
+//! - `nclients` => number of clients (more on this shortly).
 //!
-//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
-//! logging what happens when a sequential scan is requested on a small table, then picking out two
-//! suitable from logs.
+//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
+//! It spawns `nclients` times [`client`] tokio tasks.
+//! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
+//! We exercise the following combinations:
+//! - `redo_work = short / medium``
+//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
-//! Reference data (git blame to see commit) on an i3en.3xlarge
-// ```text
-//! short/short/1           time:   [39.175 µs 39.348 µs 39.536 µs]
-//! short/short/2           time:   [51.227 µs 51.487 µs 51.755 µs]
-//! short/short/4           time:   [76.048 µs 76.362 µs 76.674 µs]
-//! short/short/8           time:   [128.94 µs 129.82 µs 130.74 µs]
-//! short/short/16          time:   [227.84 µs 229.00 µs 230.28 µs]
-//! short/short/32          time:   [455.97 µs 457.81 µs 459.90 µs]
-//! short/short/64          time:   [902.46 µs 904.84 µs 907.32 µs]
-//! short/short/128         time:   [1.7416 ms 1.7487 ms 1.7561 ms]
-//! ``
-
-use std::sync::Arc;
+//! We let `criterion` determine the `n_redos` using `iter_custom`.
+//! The idea is that for each `(redo_work, nclients)` combination,
+//! criterion will run the `bench_impl` multiple times with different `n_redos`.
+//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
+//! Criterion will divide that by `n_redos` to compute the "time per iteration".
+//! In our case, "time per iteration" means "time per redo_work execution".
+//!
+//! NB: the way by which `iter_custom` determines the "number of iterations"
+//! is called sampling. Apparently the idea here is to detect outliers.
+//! We're not sure whether the current choice of sampling method makes sense.
+//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
+//!
+//! # Reference Numbers
+//!
+//! 2024-03-20 on i3en.3xlarge
+//!
+//! ```text
+//! short/1                 time:   [26.483 µs 26.614 µs 26.767 µs]
+//! short/2                 time:   [32.223 µs 32.465 µs 32.767 µs]
+//! short/4                 time:   [47.203 µs 47.583 µs 47.984 µs]
+//! short/8                 time:   [89.135 µs 89.612 µs 90.139 µs]
+//! short/16                time:   [190.12 µs 191.52 µs 192.88 µs]
+//! short/32                time:   [380.96 µs 382.63 µs 384.20 µs]
+//! short/64                time:   [736.86 µs 741.07 µs 745.03 µs]
+//! short/128               time:   [1.4106 ms 1.4206 ms 1.4294 ms]
+//! medium/1                time:   [111.81 µs 112.25 µs 112.79 µs]
+//! medium/2                time:   [158.26 µs 159.13 µs 160.21 µs]
+//! medium/4                time:   [334.65 µs 337.14 µs 340.07 µs]
+//! medium/8                time:   [675.32 µs 679.91 µs 685.25 µs]
+//! medium/16               time:   [1.2929 ms 1.2996 ms 1.3067 ms]
+//! medium/32               time:   [2.4295 ms 2.4461 ms 2.4623 ms]
+//! medium/64               time:   [4.3973 ms 4.4458 ms 4.4875 ms]
+//! medium/128              time:   [7.5955 ms 7.7847 ms 7.9481 ms]
+//! ```

 use bytes::{Buf, Bytes};
-use pageserver::{
-    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
+use criterion::{BenchmarkId, Criterion};
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver_api::{key::Key, shard::TenantShardId};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
 };
-use pageserver_api::shard::TenantShardId;
-use tokio::task::JoinSet;
+use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};

-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+fn bench(c: &mut Criterion) {
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("short");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::short_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
+        }
+    }

-fn redo_scenarios(c: &mut Criterion) {
-    // logging should be enabled when adding more inputs, since walredo will only report malformed
-    // input to the stderr.
-    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("medium");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::medium_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
+        }
+    }
+}
+criterion::criterion_group!(benches, bench);
+criterion::criterion_main!(benches);

+// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
+fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-
-    let manager = Arc::new(manager);
-
-    {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        tracing::info!("executing first");
-        rt.block_on(short().execute(&manager)).unwrap();
-        tracing::info!("first executed");
-    }
-
-    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
-
-    let mut group = c.benchmark_group("short");
-    group.sampling_mode(criterion::SamplingMode::Flat);
-
-    for thread_count in thread_counts {
-        group.bench_with_input(
-            BenchmarkId::new("short", thread_count),
-            &thread_count,
-            |b, thread_count| {
-                add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
-            },
-        );
-    }
-    drop(group);
-
-    let mut group = c.benchmark_group("medium");
-    group.sampling_mode(criterion::SamplingMode::Flat);
-
-    for thread_count in thread_counts {
-        group.bench_with_input(
-            BenchmarkId::new("medium", thread_count),
-            &thread_count,
-            |b, thread_count| {
-                add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
-            },
-        );
-    }
-    drop(group);
-}
-
-/// Sets up a multi-threaded tokio runtime with default worker thread count,
-/// then, spawn `requesters` tasks that repeatedly:
-/// - get input from `input_factor()`
-/// - call `manager.request_redo()` with their input
-///
-/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
-///
-/// Using tokio's default worker thread count means the results will differ on machines
-/// with different core countrs. We don't care about that, the performance will always
-/// be different on different hardware. To compare performance of different software versions,
-/// use the same hardware.
-fn add_multithreaded_walredo_requesters(
-    b: &mut criterion::Bencher,
-    nrequesters: usize,
-    manager: &Arc<PostgresRedoManager>,
-    input_factory: fn() -> Request,
-) {
-    assert_ne!(nrequesters, 0);
-
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

-    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
+    let start = Arc::new(Barrier::new(nclients as usize));

-    let mut requesters = JoinSet::new();
-    for _ in 0..nrequesters {
-        let _entered = rt.enter();
-        let manager = manager.clone();
-        let barrier = barrier.clone();
-        requesters.spawn(async move {
-            loop {
-                let input = input_factory();
-                barrier.wait().await;
-                let page = input.execute(&manager).await.unwrap();
-                assert_eq!(page.remaining(), 8192);
-                barrier.wait().await;
-            }
+    let mut tasks = JoinSet::new();
+
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = Arc::new(manager);
+
+    for _ in 0..nclients {
+        rt.block_on(async {
+            tasks.spawn(client(
+                Arc::clone(&manager),
+                Arc::clone(&start),
+                Arc::clone(&redo_work),
+                // divide the amount of work equally among the clients
+                n_redos / nclients,
+            ))
        });
    }

-    let do_one_iteration = || {
-        rt.block_on(async {
-            barrier.wait().await;
-            // wait for work to complete
-            barrier.wait().await;
-        })
-    };
-
-    b.iter_batched(
-        || {
-            // warmup
-            do_one_iteration();
-        },
-        |()| {
-            // work loop
-            do_one_iteration();
-        },
-        criterion::BatchSize::PerIteration,
-    );
-
-    rt.block_on(requesters.shutdown());
+    rt.block_on(async move {
+        let mut total_wallclock_time = std::time::Duration::from_millis(0);
+        while let Some(res) = tasks.join_next().await {
+            total_wallclock_time += res.unwrap();
+        }
+        total_wallclock_time
+    })
 }

-criterion_group!(benches, redo_scenarios);
-criterion_main!(benches);
+async fn client(
+    mgr: Arc<PostgresRedoManager>,
+    start: Arc<Barrier>,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+) -> Duration {
+    start.wait().await;
+    let start = Instant::now();
+    for _ in 0..n_redos {
+        let page = redo_work.execute(&mgr).await.unwrap();
+        assert_eq!(page.remaining(), 8192);
+        // The real pageserver will rarely if ever do 2 walredos in a row without
+        // yielding to the executor.
+        tokio::task::yield_now().await;
+    }
+    start.elapsed()
+}

 macro_rules! lsn {
    ($input:expr) => {{
@@ -166,12 +162,46 @@ macro_rules! lsn {
    }};
 }

-/// Short payload, 1132 bytes.
-// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
-// for null bytes.
-#[allow(clippy::octal_escapes)]
-fn short() -> Request {
-    Request {
+/// Simple wrapper around `WalRedoManager::request_redo`.
+///
+/// In benchmarks this is cloned around.
+#[derive(Clone)]
+struct Request {
+    key: Key,
+    lsn: Lsn,
+    base_img: Option<(Lsn, Bytes)>,
+    records: Vec<(Lsn, NeonWalRecord)>,
+    pg_version: u32,
+}
+
+impl Request {
+    async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
+        let Request {
+            key,
+            lsn,
+            base_img,
+            records,
+            pg_version,
+        } = self;
+
+        // TODO: avoid these clones
+        manager
+            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
+            .await
+    }
+
+    fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
+        let rec = Bytes::from_static(bytes);
+        NeonWalRecord::Postgres { will_init, rec }
+    }
+
+    /// Short payload, 1132 bytes.
+    // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
+    // for null bytes.
+    #[allow(clippy::octal_escapes)]
+    pub fn short_input() -> Request {
+        let pg_record = Self::pg_record;
+        Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -194,13 +224,14 @@ fn short() -> Request {
        ],
        pg_version: 14,
    }
-}
+    }

-/// Medium sized payload, serializes as 26393 bytes.
-// see [`short`]
-#[allow(clippy::octal_escapes)]
-fn medium() -> Request {
-    Request {
+    /// Medium sized payload, serializes as 26393 bytes.
+    // see [`short`]
+    #[allow(clippy::octal_escapes)]
+    pub fn medium_input() -> Request {
+        let pg_record = Self::pg_record;
+        Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -442,37 +473,5 @@ fn medium() -> Request {
        ],
        pg_version: 14,
    }
-}
-
-fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
-    let rec = Bytes::from_static(bytes);
-    NeonWalRecord::Postgres { will_init, rec }
-}
-
-/// Simple wrapper around `WalRedoManager::request_redo`.
-///
-/// In benchmarks this is cloned around.
-#[derive(Clone)]
-struct Request {
-    key: Key,
-    lsn: Lsn,
-    base_img: Option<(Lsn, Bytes)>,
-    records: Vec<(Lsn, NeonWalRecord)>,
-    pg_version: u32,
-}
-
-impl Request {
-    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
-        let Request {
-            key,
-            lsn,
-            base_img,
-            records,
-            pg_version,
-        } = self;
-
-        manager
-            .request_redo(key, lsn, base_img, records, pg_version)
-            .await
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -15,9 +15,9 @@ use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
+use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tracing::*;

@@ -28,7 +28,7 @@ use pageserver::{
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
    task_mgr::TaskKind,
-    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::THE_RUNTIME,
    tenant::mgr,
    virtual_file,
 };
@@ -323,7 +323,7 @@ fn start_pageserver(

    // Launch broker client
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
-    let broker_client = WALRECEIVER_RUNTIME
+    let broker_client = THE_RUNTIME
        .block_on(async {
            // Note: we do not attempt connecting here (but validate endpoints sanity).
            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
@@ -391,7 +391,7 @@ fn start_pageserver(
        conf,
    );
    if let Some(deletion_workers) = deletion_workers {
-        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
+        deletion_workers.spawn_with(THE_RUNTIME.handle());
    }

    // Up to this point no significant I/O has been done: this should have been fast.  Record
@@ -423,7 +423,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = THE_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -435,7 +435,7 @@ fn start_pageserver(
    ))?;
    let tenant_manager = Arc::new(tenant_manager);

-    BACKGROUND_RUNTIME.spawn({
+    THE_RUNTIME.spawn({
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -545,7 +545,7 @@ fn start_pageserver(
    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
    {
-        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
+        let _rt_guard = THE_RUNTIME.enter();

        let router_state = Arc::new(
            http::routes::State::new(
@@ -569,7 +569,6 @@ fn start_pageserver(
            .with_graceful_shutdown(task_mgr::shutdown_watcher());

        task_mgr::spawn(
-            MGMT_REQUEST_RUNTIME.handle(),
            TaskKind::HttpEndpointListener,
            None,
            None,
@@ -594,7 +593,6 @@ fn start_pageserver(
        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");

        task_mgr::spawn(
-            crate::BACKGROUND_RUNTIME.handle(),
            TaskKind::MetricsCollection,
            None,
            None,
@@ -615,6 +613,7 @@ fn start_pageserver(

                pageserver::consumption_metrics::collect_metrics(
                    metric_collection_endpoint,
+                    &conf.metric_collection_bucket,
                    conf.metric_collection_interval,
                    conf.cached_metric_collection_interval,
                    conf.synthetic_size_calculation_interval,
@@ -642,7 +641,6 @@ fn start_pageserver(
            DownloadBehavior::Error,
        );
        task_mgr::spawn(
-            COMPUTE_REQUEST_RUNTIME.handle(),
            TaskKind::LibpqEndpointListener,
            None,
            None,
@@ -666,42 +664,37 @@ fn start_pageserver(
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

    // All started up! Now just sit and wait for shutdown signal.
-    {
-        use signal_hook::consts::*;
-        let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
-            let mut signals =
-                signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
-            return signals
-                .forever()
-                .next()
-                .expect("forever() never returns None unless explicitly closed");
-        });
-        let signal = BACKGROUND_RUNTIME
-            .block_on(signal_handler)
-            .expect("join error");
-        match signal {
-            SIGQUIT => {
-                info!("Got signal {signal}. Terminating in immediate shutdown mode",);
-                std::process::exit(111);
-            }
-            SIGINT | SIGTERM => {
-                info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);

-                // This cancels the `shutdown_pageserver` cancellation tree.
-                // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-                // The plan is to change that over time.
-                shutdown_pageserver.take();
-                let bg_remote_storage = remote_storage.clone();
-                let bg_deletion_queue = deletion_queue.clone();
-                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
-                    &tenant_manager,
-                    bg_remote_storage.map(|_| bg_deletion_queue),
-                    0,
-                ));
-                unreachable!()
-            }
-            _ => unreachable!(),
-        }
+    {
+        THE_RUNTIME.block_on(async move {
+            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
+            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
+            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
+            let signal = tokio::select! {
+                _ = sigquit.recv() => {
+                    info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
+                    std::process::exit(111);
+                }
+                _ = sigint.recv() => { "SIGINT" },
+                _ = sigterm.recv() => { "SIGTERM" },
+            };
+
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
+
+            // This cancels the `shutdown_pageserver` cancellation tree.
+            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+            // The plan is to change that over time.
+            shutdown_pageserver.take();
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            pageserver::shutdown_pageserver(
+                &tenant_manager,
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            )
+            .await;
+            unreachable!()
+        })
    }
 }

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -234,6 +234,7 @@ pub struct PageServerConf {
    // How often to send unchanged cached metrics to the metrics endpoint.
    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
+    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,

    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -373,6 +374,7 @@ struct PageServerConfigBuilder {
    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
+    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,

    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,

@@ -455,6 +457,8 @@ impl PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),

+            metric_collection_bucket: Set(None),
+
            disk_usage_based_eviction: Set(None),

            test_remote_failures: Set(0),
@@ -586,6 +590,13 @@ impl PageServerConfigBuilder {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }

+    pub fn metric_collection_bucket(
+        &mut self,
+        metric_collection_bucket: Option<RemoteStorageConfig>,
+    ) {
+        self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
+    }
+
    pub fn synthetic_size_calculation_interval(
        &mut self,
        synthetic_size_calculation_interval: Duration,
@@ -694,6 +705,7 @@ impl PageServerConfigBuilder {
                metric_collection_interval,
                cached_metric_collection_interval,
                metric_collection_endpoint,
+                metric_collection_bucket,
                synthetic_size_calculation_interval,
                disk_usage_based_eviction,
                test_remote_failures,
@@ -942,6 +954,9 @@ impl PageServerConf {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
                },
+                "metric_collection_bucket" => {
+                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
+                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -1057,6 +1072,7 @@ impl PageServerConf {
            metric_collection_interval: Duration::from_secs(60),
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
@@ -1289,6 +1305,7 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
@@ -1363,6 +1380,7 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: Duration::from_secs(222),
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
+                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,12 +1,13 @@
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::task_mgr::{self, TaskKind};
 use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
+use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
 use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -41,6 +42,7 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
 #[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
+    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
@@ -59,7 +61,6 @@ pub async fn collect_metrics(
    let worker_ctx =
        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
        TaskKind::CalculateSyntheticSize,
        None,
        None,
@@ -94,6 +95,20 @@ pub async fn collect_metrics(
        .build()
        .expect("Failed to create http client with timeout");

+    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
+        match GenericRemoteStorage::from_config(bucket_config) {
+            Ok(client) => Some(client),
+            Err(e) => {
+                // Non-fatal error: if we were given an invalid config, we will proceed
+                // with sending metrics over the network, but not to S3.
+                tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
+                None
+            }
+        }
+    } else {
+        None
+    };
+
    let node_id = node_id.to_string();

    loop {
@@ -118,10 +133,18 @@ pub async fn collect_metrics(
                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
+
+            if let Some(bucket_client) = &bucket_client {
+                let res =
+                    upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
+                if let Err(e) = res {
+                    tracing::error!("failed to upload to S3: {e:#}");
+                }
+            }
        };

        let upload = async {
-            let res = upload::upload_metrics(
+            let res = upload::upload_metrics_http(
                &client,
                metric_collection_endpoint,
                &cancel,
@@ -132,7 +155,7 @@ pub async fn collect_metrics(
            .await;
            if let Err(e) = res {
                // serialization error which should never happen
-                tracing::error!("failed to upload due to {e:#}");
+                tracing::error!("failed to upload via HTTP due to {e:#}");
            }
        };

--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,4 +1,9 @@
+use std::time::SystemTime;
+
+use chrono::{DateTime, Utc};
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -13,8 +18,9 @@ struct Ids {
    pub(super) timeline_id: Option<TimelineId>,
 }

+/// Serialize and write metrics to an HTTP endpoint
 #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
-pub(super) async fn upload_metrics(
+pub(super) async fn upload_metrics_http(
    client: &reqwest::Client,
    metric_collection_endpoint: &reqwest::Url,
    cancel: &CancellationToken,
@@ -74,6 +80,60 @@ pub(super) async fn upload_metrics(
    Ok(())
 }

+/// Serialize and write metrics to a remote storage object
+#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
+pub(super) async fn upload_metrics_bucket(
+    client: &GenericRemoteStorage,
+    cancel: &CancellationToken,
+    node_id: &str,
+    metrics: &[RawMetric],
+) -> anyhow::Result<()> {
+    if metrics.is_empty() {
+        // Skip uploads if we have no metrics, so that readers don't have to handle the edge case
+        // of an empty object.
+        return Ok(());
+    }
+
+    // Compose object path
+    let datetime: DateTime<Utc> = SystemTime::now().into();
+    let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
+    let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
+
+    // Set up a gzip writer into a buffer
+    let mut compressed_bytes: Vec<u8> = Vec::new();
+    let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
+    let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
+
+    // Serialize and write into compressed buffer
+    let started_at = std::time::Instant::now();
+    for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
+        let (_chunk, body) = res?;
+        gzip_writer.write_all(&body).await?;
+    }
+    gzip_writer.flush().await?;
+    gzip_writer.shutdown().await?;
+    let compressed_length = compressed_bytes.len();
+
+    // Write to remote storage
+    client
+        .upload_storage_object(
+            futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
+            compressed_length,
+            &path,
+            cancel,
+        )
+        .await?;
+    let elapsed = started_at.elapsed();
+
+    tracing::info!(
+        compressed_length,
+        elapsed_ms = elapsed.as_millis(),
+        "write metrics bucket at {path}",
+    );
+
+    Ok(())
+}
+
 // The return type is quite ugly, but we gain testability in isolation
 fn serialize_in_chunks<'a, F>(
    chunk_size: usize,
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -5,7 +5,8 @@ use pageserver_api::{
    controller_api::NodeRegisterRequest,
    shard::TenantShardId,
    upcall_api::{
-        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
+        ValidateRequestTenant, ValidateResponse,
    },
 };
 use serde::{de::DeserializeOwned, Serialize};
@@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi {
    fn re_attach(
        &self,
        conf: &PageServerConf,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
+    ) -> impl Future<
+        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
+    > + Send;
    fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
@@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
    async fn re_attach(
        &self,
        conf: &PageServerConf,
-    ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -170,8 +173,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            register,
        };

-        fail::fail_point!("control-plane-client-re-attach");
-
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
@@ -181,7 +182,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|t| (t.id, Generation::new(t.gen)))
+            .map(|rart| (rart.id, rart))
            .collect::<HashMap<_, _>>())
    }

@@ -207,7 +208,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

-        fail::fail_point!("control-plane-client-validate");
+        crate::tenant::pausable_failpoint!("control-plane-client-validate");

        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -724,8 +724,8 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
-    use pageserver_api::shard::ShardIndex;
-    use std::io::ErrorKind;
+    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
+    use std::{io::ErrorKind, time::Duration};
    use tracing::info;

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -834,9 +834,10 @@ mod test {
        async fn re_attach(
            &self,
            _conf: &PageServerConf,
-        ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+        ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
            unimplemented!()
        }
+
        async fn validate(
            &self,
            tenants: Vec<(TenantShardId, Generation)>,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -59,7 +59,7 @@ use utils::{completion, id::TimelineId};
 use crate::{
    config::PageServerConf,
    metrics::disk_usage_based_eviction::METRICS,
-    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    task_mgr::{self, TaskKind},
    tenant::{
        self,
        mgr::TenantManager,
@@ -202,7 +202,6 @@ pub fn launch_disk_usage_global_eviction_task(
    info!("launching disk usage based eviction task");

    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        None,
        None,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
+use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -885,14 +886,16 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(
-        conf,
-        tenant_shard_id,
-        detach_ignored.unwrap_or(false),
-        &state.deletion_queue_client,
-    )
-    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
-    .await?;
+    state
+        .tenant_manager
+        .detach_tenant(
+            conf,
+            tenant_shard_id,
+            detach_ignored.unwrap_or(false),
+            &state.deletion_queue_client,
+        )
+        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -1403,7 +1406,9 @@ async fn update_tenant_config_handler(
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);
-    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
+    state
+        .tenant_manager
+        .set_new_tenant_config(tenant_conf, tenant_id)
        .instrument(info_span!("tenant_config", %tenant_id))
        .await?;

@@ -1428,13 +1433,14 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) =
-            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
-                .instrument(info_span!("tenant_detach",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                ))
-                .await
+        if let Err(e) = state
+            .tenant_manager
+            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+            .instrument(info_span!("tenant_detach",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()
+            ))
+            .await
        {
            match e {
                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1648,8 +1654,7 @@ async fn timeline_gc_handler(
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done =
-        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -2262,6 +2267,7 @@ pub fn make_router(

    Ok(router
        .data(state)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,28 +2,20 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
-use std::io::SeekFrom;
 use std::path::{Path, PathBuf};

 use anyhow::{bail, ensure, Context, Result};
-use async_compression::tokio::bufread::ZstdDecoder;
-use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
-use nix::NixPath;
-use tokio::fs::{File, OpenOptions};
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
+use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
-use tokio_tar::Builder;
-use tokio_tar::HeaderMode;
 use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
-use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
@@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
    reader.read_to_end(&mut buf).await?;
    Ok(Bytes::from(buf))
 }
-
-pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
-    let file = OpenOptions::new()
-        .create(true)
-        .truncate(true)
-        .read(true)
-        .write(true)
-        .open(&tmp_path)
-        .await
-        .with_context(|| format!("tempfile creation {tmp_path}"))?;
-
-    let mut paths = Vec::new();
-    for entry in WalkDir::new(pgdata_path) {
-        let entry = entry?;
-        let metadata = entry.metadata().expect("error getting dir entry metadata");
-        // Also allow directories so that we also get empty directories
-        if !(metadata.is_file() || metadata.is_dir()) {
-            continue;
-        }
-        let path = entry.into_path();
-        paths.push(path);
-    }
-    // Do a sort to get a more consistent listing
-    paths.sort_unstable();
-    let zstd = ZstdEncoder::with_quality_and_params(
-        file,
-        Level::Default,
-        &[CParameter::enable_long_distance_matching(true)],
-    );
-    let mut builder = Builder::new(zstd);
-    // Use reproducible header mode
-    builder.mode(HeaderMode::Deterministic);
-    for path in paths {
-        let rel_path = path.strip_prefix(pgdata_path)?;
-        if rel_path.is_empty() {
-            // The top directory should not be compressed,
-            // the tar crate doesn't like that
-            continue;
-        }
-        builder.append_path_with_name(&path, rel_path).await?;
-    }
-    let mut zstd = builder.into_inner().await?;
-    zstd.shutdown().await?;
-    let mut compressed = zstd.into_inner();
-    let compressed_len = compressed.metadata().await?.len();
-    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
-    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
-        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
-    }
-    compressed.seek(SeekFrom::Start(0)).await?;
-    Ok((compressed, compressed_len))
-}
-
-pub async fn extract_tar_zst(
-    pgdata_path: &Utf8Path,
-    tar_zst: impl AsyncBufRead + Unpin,
-) -> Result<()> {
-    let tar = Box::pin(ZstdDecoder::new(tar_zst));
-    let mut archive = Archive::new(tar);
-    archive.unpack(pgdata_path).await?;
-    Ok(())
-}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -180,7 +180,6 @@ pub async fn libpq_listener_main(
                // only deal with a particular timeline, but we don't know which one
                // yet.
                task_mgr::spawn(
-                    &tokio::runtime::Handle::current(),
                    TaskKind::PageRequestHandler,
                    None,
                    None,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -34,6 +34,7 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
+use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

 const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> {
        if !self.pending_updates.is_empty() {
            // The put_batch call below expects expects the inputs to be sorted by Lsn,
            // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
-                .pending_updates
-                .drain()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
-                .collect();
+            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
+                self.pending_updates
+                    .drain()
+                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
+                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
+                VecMapOrdering::GreaterOrEqual,
+            );

            writer.put_batch(lsn_ordered_batch, ctx).await?;
        }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -98,42 +98,22 @@ use utils::id::TimelineId;
 // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
 // happen, but still.
 //
-pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("compute request worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create compute request runtime")
-});

-pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
+/// The single tokio runtime used by all pageserver code.
+/// In the past, we had multiple runtimes, and in the future we should weed out
+/// remaining references to this global field and rely on ambient runtime instead,
+/// i.e., use `tokio::spawn` instead of `THE_RUNTIME.spawn()`, etc.
+pub static THE_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("mgmt request worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create mgmt request runtime")
-});
-
-pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("walreceiver worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create walreceiver runtime")
-});
-
-pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("background op worker")
        // if you change the number of worker threads please change the constant below
        .enable_all()
        .build()
        .expect("Failed to create background op runtime")
 });

-pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
+pub(crate) static THE_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
    // force init and thus panics
-    let _ = BACKGROUND_RUNTIME.handle();
+    let _ = THE_RUNTIME.handle();
    // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
    // tokio would had already panicked for parsing errors or NotUnicode
    //
@@ -325,7 +305,6 @@ struct PageServerTask {
 /// Note: if shutdown_process_on_error is set to true failure
 ///   of the task will lead to shutdown of entire process
 pub fn spawn<F>(
-    runtime: &tokio::runtime::Handle,
    kind: TaskKind,
    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
@@ -354,7 +333,7 @@ where

    let task_name = name.to_string();
    let task_cloned = Arc::clone(&task);
-    let join_handle = runtime.spawn(task_wrapper(
+    let join_handle = THE_RUNTIME.spawn(task_wrapper(
        task_name,
        task_id,
        task_cloned,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -43,6 +43,8 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
+use utils::zstd::create_zst_tarball;
+use utils::zstd::extract_zst_tarball;

 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
@@ -142,6 +144,7 @@ macro_rules! pausable_failpoint {
        }
    };
 }
+pub(crate) use pausable_failpoint;

 pub mod blob_io;
 pub mod block_io;
@@ -200,6 +203,13 @@ pub(super) struct AttachedTenantConf {
 }

 impl AttachedTenantConf {
+    fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
+        Self {
+            tenant_conf,
+            location,
+        }
+    }
+
    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
        match &location_conf.mode {
            LocationMode::Attached(attach_conf) => Ok(Self {
@@ -652,7 +662,6 @@ impl Tenant {
        let tenant_clone = Arc::clone(&tenant);
        let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
        task_mgr::spawn(
-            &tokio::runtime::Handle::current(),
            TaskKind::Attach,
            Some(tenant_shard_id),
            None,
@@ -676,9 +685,20 @@ impl Tenant {
                }

                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
+                enum BrokenVerbosity {
+                    Error,
+                    Info
+                }
                let make_broken =
-                    |t: &Tenant, err: anyhow::Error| {
-                        error!("attach failed, setting tenant state to Broken: {err:?}");
+                    |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
+                        match verbosity {
+                            BrokenVerbosity::Info => {
+                                info!("attach cancelled, setting tenant state to Broken: {err}");
+                            },
+                            BrokenVerbosity::Error => {
+                                error!("attach failed, setting tenant state to Broken: {err:?}");
+                            }
+                        }
                        t.state.send_modify(|state| {
                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                            // if it errors, we will call make_broken when tenant is already in Stopping.
@@ -742,7 +762,7 @@ impl Tenant {
                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
                            // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
+                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
                            return Ok(());
                        },
                    )
@@ -764,7 +784,7 @@ impl Tenant {
                        match res {
                            Ok(p) => Some(p),
                            Err(e) => {
-                                make_broken(&tenant_clone, anyhow::anyhow!(e));
+                                make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                                return Ok(());
                            }
                        }
@@ -788,7 +808,7 @@ impl Tenant {
                    {
                        Ok(should_resume_deletion) => should_resume_deletion,
                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
                            return Ok(());
                        }
                    }
@@ -818,7 +838,7 @@ impl Tenant {
                    .await;

                    if let Err(e) = deleted {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                    }

                    return Ok(());
@@ -839,7 +859,7 @@ impl Tenant {
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                    }
                }

@@ -3042,8 +3062,13 @@ impl Tenant {
            }
        }

-        let (pgdata_zstd, tar_zst_size) =
-            import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
+        let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
+        const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
+        if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
+            warn!(
+                "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
+            );
+        }

        pausable_failpoint!("before-initdb-upload");

@@ -3143,7 +3168,7 @@ impl Tenant {

            let buf_read =
                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
-            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
+            extract_zst_tarball(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;
        } else {
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -196,16 +196,17 @@ impl LocationConf {
    /// For use when attaching/re-attaching: update the generation stored in this
    /// structure.  If we were in a secondary state, promote to attached (posession
    /// of a fresh generation implies this).
-    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
+    pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
        match &mut self.mode {
            LocationMode::Attached(attach_conf) => {
                attach_conf.generation = generation;
+                attach_conf.attach_mode = mode;
            }
            LocationMode::Secondary(_) => {
                // We are promoted to attached by the control plane's re-attach response
                self.mode = LocationMode::Attached(AttachedLocationConfig {
                    generation,
-                    attach_mode: AttachmentMode::Single,
+                    attach_mode: mode,
                })
            }
        }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -111,6 +111,7 @@ async fn create_local_delete_mark(
    let _ = std::fs::OpenOptions::new()
        .write(true)
        .create(true)
+        .truncate(true)
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;

@@ -481,7 +482,6 @@ impl DeleteTenantFlow {
        let tenant_shard_id = tenant.tenant_shard_id;

        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
            Some(tenant_shard_id),
            None,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,13 +2,13 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
-use pageserver_api::models::ShardParameters;
+use pageserver_api::models::{LocationConfigMode, ShardParameters};
 use pageserver_api::shard::{
    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
+use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -125,6 +125,46 @@ pub(crate) enum ShardSelector {
    Page(Key),
 }

+/// A convenience for use with the re_attach ControlPlaneClient function: rather
+/// than the serializable struct, we build this enum that encapsulates
+/// the invariant that attached tenants always have generations.
+///
+/// This represents the subset of a LocationConfig that we receive during re-attach.
+pub(crate) enum TenantStartupMode {
+    Attached((AttachmentMode, Generation)),
+    Secondary,
+}
+
+impl TenantStartupMode {
+    /// Return the generation & mode that should be used when starting
+    /// this tenant.
+    ///
+    /// If this returns None, the re-attach struct is in an invalid state and
+    /// should be ignored in the response.
+    fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
+        match (rart.mode, rart.gen) {
+            (LocationConfigMode::Detached, _) => None,
+            (LocationConfigMode::Secondary, _) => Some(Self::Secondary),
+            (LocationConfigMode::AttachedMulti, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
+            }
+            (LocationConfigMode::AttachedSingle, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
+            }
+            (LocationConfigMode::AttachedStale, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
+            }
+            _ => {
+                tracing::warn!(
+                    "Received invalid re-attach state for tenant {}: {rart:?}",
+                    rart.id
+                );
+                None
+            }
+        }
+    }
+}
+
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -271,7 +311,7 @@ pub struct TenantManager {

 fn emergency_generations(
    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantShardId, Generation> {
+) -> HashMap<TenantShardId, TenantStartupMode> {
    tenant_confs
        .iter()
        .filter_map(|(tid, lc)| {
@@ -279,12 +319,15 @@ fn emergency_generations(
                Ok(lc) => lc,
                Err(_) => return None,
            };
-            let gen = match &lc.mode {
-                LocationMode::Attached(alc) => Some(alc.generation),
-                LocationMode::Secondary(_) => None,
-            };
-
-            gen.map(|g| (*tid, g))
+            Some((
+                *tid,
+                match &lc.mode {
+                    LocationMode::Attached(alc) => {
+                        TenantStartupMode::Attached((alc.attach_mode, alc.generation))
+                    }
+                    LocationMode::Secondary(_) => TenantStartupMode::Secondary,
+                },
+            ))
        })
        .collect()
 }
@@ -294,7 +337,7 @@ async fn init_load_generations(
    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
    resources: &TenantSharedResources,
    cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
+) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
    let generations = if conf.control_plane_emergency_mode {
        error!(
            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
@@ -304,7 +347,12 @@ async fn init_load_generations(
        info!("Calling control plane API to re-attach tenants");
        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
        match client.re_attach(conf).await {
-            Ok(tenants) => tenants,
+            Ok(tenants) => tenants
+                .into_iter()
+                .flat_map(|(id, rart)| {
+                    TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
+                })
+                .collect(),
            Err(RetryForeverError::ShuttingDown) => {
                anyhow::bail!("Shut down while waiting for control plane re-attach response")
            }
@@ -322,9 +370,17 @@ async fn init_load_generations(
    // Must only do this if remote storage is enabled, otherwise deletion queue
    // is not running and channel push will fail.
    if resources.remote_storage.is_some() {
-        resources
-            .deletion_queue_client
-            .recover(generations.clone())?;
+        let attached_tenants = generations
+            .iter()
+            .flat_map(|(id, start_mode)| {
+                match start_mode {
+                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                    TenantStartupMode::Secondary => None,
+                }
+                .map(|gen| (*id, *gen))
+            })
+            .collect();
+        resources.deletion_queue_client.recover(attached_tenants)?;
    }

    Ok(Some(generations))
@@ -490,9 +546,8 @@ pub async fn init_tenant_mgr(
    // Scan local filesystem for attached tenants
    let tenant_configs = init_load_tenant_configs(conf).await?;

-    // Determine which tenants are to be attached
-    let tenant_generations =
-        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+    // Determine which tenants are to be secondary or attached, and in which generation
+    let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

    tracing::info!(
        "Attaching {} tenants at startup, warming up {} at a time",
@@ -522,97 +577,102 @@ pub async fn init_tenant_mgr(
            }
        };

-        let generation = if let Some(generations) = &tenant_generations {
+        // FIXME: if we were attached, and get demoted to secondary on re-attach, we
+        // don't have a place to get a config.
+        // (https://github.com/neondatabase/neon/issues/5377)
+        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
+            SecondaryLocationConfig { warm: true };
+
+        // Update the location config according to the re-attach response
+        if let Some(tenant_modes) = &tenant_modes {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
-            if let Some(gen) = generations.get(&tenant_shard_id) {
-                if let LocationMode::Attached(attached) = &location_conf.mode {
-                    if attached.generation > *gen {
+            match tenant_modes.get(&tenant_shard_id) {
+                None => {
+                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
+                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                        );
+                    }
+
+                    // We deleted local content: move on to next tenant, don't try and spawn this one.
+                    continue;
+                }
+                Some(TenantStartupMode::Secondary) => {
+                    if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
+                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
+                    }
+                }
+                Some(TenantStartupMode::Attached((attach_mode, generation))) => {
+                    let old_gen_higher = match &location_conf.mode {
+                        LocationMode::Attached(AttachedLocationConfig {
+                            generation: old_generation,
+                            attach_mode: _attach_mode,
+                        }) => {
+                            if old_generation > generation {
+                                Some(old_generation)
+                            } else {
+                                None
+                            }
+                        }
+                        _ => None,
+                    };
+                    if let Some(old_generation) = old_gen_higher {
                        tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
-                            attached.generation
+                            "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            old_generation
                        );

                        // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
                        // local disk content: demote to secondary rather than detaching.
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                location_conf.shard,
-                                location_conf.tenant_conf.clone(),
-                                &SecondaryLocationConfig { warm: false },
-                            )),
-                        );
+                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
+                    } else {
+                        location_conf.attach_in_generation(*attach_mode, *generation);
                    }
                }
-                *gen
-            } else {
-                match &location_conf.mode {
-                    LocationMode::Secondary(secondary_config) => {
-                        // We do not require the control plane's permission for secondary mode
-                        // tenants, because they do no remote writes and hence require no
-                        // generation number
-                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                location_conf.shard,
-                                location_conf.tenant_conf,
-                                secondary_config,
-                            )),
-                        );
-                    }
-                    LocationMode::Attached(_) => {
-                        // TODO: augment re-attach API to enable the control plane to
-                        // instruct us about secondary attachments.  That way, instead of throwing
-                        // away local state, we can gracefully fall back to secondary here, if the control
-                        // plane tells us so.
-                        // (https://github.com/neondatabase/neon/issues/5377)
-                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
-                            );
-                        }
-                    }
-                };
-
-                continue;
            }
        } else {
            // Legacy mode: no generation information, any tenant present
            // on local disk may activate
            info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
-            Generation::none()
        };

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
-        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

        let shard_identity = location_conf.shard;
-        match tenant_spawn(
-            conf,
-            tenant_shard_id,
-            &tenant_dir_path,
-            resources.clone(),
-            AttachedTenantConf::try_from(location_conf)?,
-            shard_identity,
-            Some(init_order.clone()),
-            &TENANTS,
-            SpawnMode::Lazy,
-            &ctx,
-        ) {
-            Ok(tenant) => {
-                tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
+        let slot = match location_conf.mode {
+            LocationMode::Attached(attached_conf) => {
+                match tenant_spawn(
+                    conf,
+                    tenant_shard_id,
+                    &tenant_dir_path,
+                    resources.clone(),
+                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    shard_identity,
+                    Some(init_order.clone()),
+                    &TENANTS,
+                    SpawnMode::Lazy,
+                    &ctx,
+                ) {
+                    Ok(tenant) => TenantSlot::Attached(tenant),
+                    Err(e) => {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
+                        continue;
+                    }
+                }
            }
-            Err(e) => {
-                error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
-            }
-        }
+            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
+                tenant_shard_id,
+                shard_identity,
+                location_conf.tenant_conf,
+                &secondary_conf,
+            )),
+        };
+
+        tenants.insert(tenant_shard_id, slot);
    }

    info!("Processed {} local tenants at startup", tenants.len());
@@ -633,7 +693,7 @@ pub async fn init_tenant_mgr(
 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
 /// a broken tenant in the map if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
-pub(crate) fn tenant_spawn(
+fn tenant_spawn(
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    tenant_path: &Utf8Path,
@@ -825,40 +885,6 @@ pub(crate) enum SetNewTenantConfigError {
    Other(anyhow::Error),
 }

-pub(crate) async fn set_new_tenant_config(
-    conf: &'static PageServerConf,
-    new_tenant_conf: TenantConfOpt,
-    tenant_id: TenantId,
-) -> Result<(), SetNewTenantConfigError> {
-    // Legacy API: does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    info!("configuring tenant {tenant_id}");
-    let tenant = get_tenant(tenant_shard_id, true)?;
-
-    if !tenant.tenant_shard_id().shard_count.is_unsharded() {
-        // Note that we use ShardParameters::default below.
-        return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
-            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
-        )));
-    }
-
-    // This is a legacy API that only operates on attached tenants: the preferred
-    // API to use is the location_config/ endpoint, which lets the caller provide
-    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(
-        new_tenant_conf.clone(),
-        tenant.generation,
-        &ShardParameters::default(),
-    );
-
-    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
-        .await
-        .map_err(SetNewTenantConfigError::Persist)?;
-    tenant.set_new_tenant_config(new_tenant_conf);
-    Ok(())
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum UpsertLocationError {
    #[error("Bad config request: {0}")]
@@ -1661,19 +1687,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MgmtRequest,
-            None,
-            None,
-            "tenant_files_delete",
-            false,
-            async move {
-                fs::remove_dir_all(tmp_path.as_path())
-                    .await
-                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-            },
-        );
+        self.spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1708,9 +1722,9 @@ impl TenantManager {
                    .layers
                    .read()
                    .await
-                    .resident_layers()
-                    .collect::<Vec<_>>()
-                    .await;
+                    .likely_resident_layers()
+                    .collect::<Vec<_>>();
+
                for layer in timeline_layers {
                    let relative_path = layer
                        .local_path()
@@ -1827,6 +1841,133 @@ impl TenantManager {

        shutdown_all_tenants0(self.tenants).await
    }
+
+    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
+    /// the background, and thereby avoid blocking any API requests on this deletion completing.
+    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
+        // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+        // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+        let task_tenant_id = None;
+
+        task_mgr::spawn(
+            TaskKind::MgmtRequest,
+            task_tenant_id,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+    }
+
+    pub(crate) async fn detach_tenant(
+        &self,
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
+        deletion_queue_client: &DeletionQueueClient,
+    ) -> Result<(), TenantStateError> {
+        let tmp_path = self
+            .detach_tenant0(
+                conf,
+                &TENANTS,
+                tenant_shard_id,
+                detach_ignored,
+                deletion_queue_client,
+            )
+            .await?;
+        self.spawn_background_purge(tmp_path);
+
+        Ok(())
+    }
+
+    async fn detach_tenant0(
+        &self,
+        conf: &'static PageServerConf,
+        tenants: &std::sync::RwLock<TenantsMap>,
+        tenant_shard_id: TenantShardId,
+        detach_ignored: bool,
+        deletion_queue_client: &DeletionQueueClient,
+    ) -> Result<Utf8PathBuf, TenantStateError> {
+        let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
+            let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
+            safe_rename_tenant_dir(&local_tenant_directory)
+                .await
+                .with_context(|| {
+                    format!("local tenant directory {local_tenant_directory:?} rename")
+                })
+        };
+
+        let removal_result = remove_tenant_from_memory(
+            tenants,
+            tenant_shard_id,
+            tenant_dir_rename_operation(tenant_shard_id),
+        )
+        .await;
+
+        // Flush pending deletions, so that they have a good chance of passing validation
+        // before this tenant is potentially re-attached elsewhere.
+        deletion_queue_client.flush_advisory();
+
+        // Ignored tenants are not present in memory and will bail the removal from memory operation.
+        // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
+        if detach_ignored
+            && matches!(
+                removal_result,
+                Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
+            )
+        {
+            let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
+            if tenant_ignore_mark.exists() {
+                info!("Detaching an ignored tenant");
+                let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
+                    .await
+                    .with_context(|| {
+                        format!("Ignored tenant {tenant_shard_id} local directory rename")
+                    })?;
+                return Ok(tmp_path);
+            }
+        }
+
+        removal_result
+    }
+
+    pub(crate) async fn set_new_tenant_config(
+        &self,
+        new_tenant_conf: TenantConfOpt,
+        tenant_id: TenantId,
+    ) -> Result<(), SetNewTenantConfigError> {
+        // Legacy API: does not support sharding
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+        info!("configuring tenant {tenant_id}");
+        let tenant = get_tenant(tenant_shard_id, true)?;
+
+        if !tenant.tenant_shard_id().shard_count.is_unsharded() {
+            // Note that we use ShardParameters::default below.
+            return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
+            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
+        )));
+        }
+
+        // This is a legacy API that only operates on attached tenants: the preferred
+        // API to use is the location_config/ endpoint, which lets the caller provide
+        // the full LocationConf.
+        let location_conf = LocationConf::attached_single(
+            new_tenant_conf.clone(),
+            tenant.generation,
+            &ShardParameters::default(),
+        );
+
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
+            .await
+            .map_err(SetNewTenantConfigError::Persist)?;
+        tenant.set_new_tenant_config(new_tenant_conf);
+        Ok(())
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2028,87 +2169,6 @@ pub(crate) enum TenantStateError {
    Other(#[from] anyhow::Error),
 }

-pub(crate) async fn detach_tenant(
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
-) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(
-        conf,
-        &TENANTS,
-        tenant_shard_id,
-        detach_ignored,
-        deletion_queue_client,
-    )
-    .await?;
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-    Ok(())
-}
-
-async fn detach_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &std::sync::RwLock<TenantsMap>,
-    tenant_shard_id: TenantShardId,
-    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
-) -> Result<Utf8PathBuf, TenantStateError> {
-    let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
-        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
-        safe_rename_tenant_dir(&local_tenant_directory)
-            .await
-            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
-    };
-
-    let removal_result = remove_tenant_from_memory(
-        tenants,
-        tenant_shard_id,
-        tenant_dir_rename_operation(tenant_shard_id),
-    )
-    .await;
-
-    // Flush pending deletions, so that they have a good chance of passing validation
-    // before this tenant is potentially re-attached elsewhere.
-    deletion_queue_client.flush_advisory();
-
-    // Ignored tenants are not present in memory and will bail the removal from memory operation.
-    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
-    if detach_ignored
-        && matches!(
-            removal_result,
-            Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
-        )
-    {
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
-        if tenant_ignore_mark.exists() {
-            info!("Detaching an ignored tenant");
-            let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
-                .await
-                .with_context(|| {
-                    format!("Ignored tenant {tenant_shard_id} local directory rename")
-                })?;
-            return Ok(tmp_path);
-        }
-    }
-
-    removal_result
-}
-
 pub(crate) async fn load_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
@@ -2142,7 +2202,7 @@ pub(crate) async fn load_tenant(

    let mut location_conf =
        Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
-    location_conf.attach_in_generation(generation);
+    location_conf.attach_in_generation(AttachmentMode::Single, generation);

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

@@ -2729,7 +2789,7 @@ use {
    utils::http::error::ApiError,
 };

-pub(crate) async fn immediate_gc(
+pub(crate) fn immediate_gc(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    gc_req: TimelineGcRequest,
@@ -2751,30 +2811,28 @@ pub(crate) async fn immediate_gc(
    // Run in task_mgr to avoid race with tenant_detach operation
    let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
+
    // TODO: spawning is redundant now, need to hold the gate
    task_mgr::spawn(
-        &tokio::runtime::Handle::current(),
        TaskKind::GarbageCollector,
        Some(tenant_shard_id),
        Some(timeline_id),
        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
        false,
        async move {
-            fail::fail_point!("immediate_gc_task_pre");
-
            #[allow(unused_mut)]
            let mut result = tenant
                .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-                .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
                .await;
                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
                // better once the types support it.

            #[cfg(feature = "testing")]
            {
+                // we need to synchronize with drop completion for python tests without polling for
+                // log messages
                if let Ok(result) = result.as_mut() {
-                    // why not futures unordered? it seems it needs very much the same task structure
-                    // but would only run on single task.
                    let mut js = tokio::task::JoinSet::new();
                    for layer in std::mem::take(&mut result.doomed_layers) {
                        js.spawn(layer.wait_drop());
@@ -2790,7 +2848,7 @@ pub(crate) async fn immediate_gc(

                if let Some(rtc) = rtc {
                    // layer drops schedule actions on remote timeline client to actually do the
-                    // deletions; don't care just exit fast about the shutdown error
+                    // deletions; don't care about the shutdown error, just exit fast
                    drop(rtc.wait_completion().await);
                }
            }
@@ -2801,6 +2859,7 @@ pub(crate) async fn immediate_gc(
            }
            Ok(())
        }
+        .instrument(span)
    );

    // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -223,7 +223,6 @@ use crate::{
    config::PageServerConf,
    task_mgr,
    task_mgr::TaskKind,
-    task_mgr::BACKGROUND_RUNTIME,
    tenant::metadata::TimelineMetadata,
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
@@ -307,8 +306,6 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
    conf: &'static PageServerConf,

-    runtime: tokio::runtime::Handle,
-
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    generation: Generation,
@@ -341,12 +338,6 @@ impl RemoteTimelineClient {
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: if cfg!(test) {
-                // remote_timeline_client.rs tests rely on current-thread runtime
-                tokio::runtime::Handle::current()
-            } else {
-                BACKGROUND_RUNTIME.handle().clone()
-            },
            tenant_shard_id,
            timeline_id,
            generation,
@@ -1281,7 +1272,6 @@ impl RemoteTimelineClient {
            let tenant_shard_id = self.tenant_shard_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
-                &self.runtime,
                TaskKind::RemoteUploadTask,
                Some(self.tenant_shard_id),
                Some(self.timeline_id),
@@ -1876,7 +1866,6 @@ mod tests {
        fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
            Arc::new(RemoteTimelineClient {
                conf: self.harness.conf,
-                runtime: tokio::runtime::Handle::current(),
                tenant_shard_id: self.harness.tenant_shard_id,
                timeline_id: TIMELINE_ID,
                generation,
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,7 +8,7 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
    config::PageServerConf,
    disk_usage_eviction_task::DiskUsageEvictionInfo,
-    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    task_mgr::{self, TaskKind},
    virtual_file::MaybeFatalIo,
 };

@@ -317,7 +317,6 @@ pub fn spawn_tasks(
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryDownloads,
        None,
        None,
@@ -338,7 +337,6 @@ pub fn spawn_tasks(
    );

    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryUploads,
        None,
        None,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -15,6 +15,7 @@ use crate::{
    tenant::{
        config::SecondaryLocationConfig,
        debug_assert_current_span_has_tenant_and_timeline_id,
+        ephemeral_file::is_ephemeral_file,
        remote_timeline_client::{
            index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
@@ -534,7 +535,11 @@ impl<'a> TenantDownloader<'a> {
            .await
            .maybe_fatal_err(&context_msg)?;

-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+        tracing::debug!(
+            "Wrote local heatmap to {}, with {} timelines",
+            heatmap_path,
+            heatmap.timelines.len()
+        );

        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
        // principle that deletions should be done before writes wherever possible, and so that we can use this
@@ -547,6 +552,10 @@ impl<'a> TenantDownloader<'a> {
        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
            if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!(
+                    "Cancelled before downloading timeline {}",
+                    timeline.timeline_id
+                );
                return Ok(());
            }

@@ -764,10 +773,13 @@ impl<'a> TenantDownloader<'a> {
            }
        };

+        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!("Cancelled -- dropping out of layer loop");
                return Ok(());
            }

@@ -950,7 +962,10 @@ async fn init_timeline_state(
            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
            continue;
-        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
+        } else if crate::is_temporary(&file_path)
+            || is_temp_download_file(&file_path)
+            || is_ephemeral_file(file_name)
+        {
            // Temporary files are frequently left behind from restarting during downloads
            tracing::info!("Cleaning up temporary file {file_path}");
            if let Err(e) = tokio::fs::remove_file(&file_path)
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -300,6 +300,7 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+            tracing::info!("Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer/failpoints.rs
+++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs
@@ -0,0 +1,119 @@
+//! failpoints for unit tests, implying `#[cfg(test)]`.
+//!
+//! These are not accessible over http.
+
+use super::*;
+
+impl Layer {
+    /// Enable a failpoint from a unit test.
+    pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
+        self.0.failpoints.lock().unwrap().push(failpoint);
+    }
+}
+
+impl LayerInner {
+    /// Query if this failpoint is enabled, as in, arrive at a failpoint.
+    ///
+    /// Calls to this method need to be `#[cfg(test)]` guarded.
+    pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
+        let fut = {
+            let mut fps = self.failpoints.lock().unwrap();
+            // find the *last* failpoint for cases in which we need to use multiple for the same
+            // thing (two blocked evictions)
+            let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
+
+            let Some(fp) = fp else {
+                return Ok(());
+            };
+
+            fp.hit()
+        };
+
+        fut.await
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) enum FailpointKind {
+    /// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
+    AfterDeterminingLayerNeedsNoDownload,
+    /// Failpoint for stalling eviction starting
+    WaitBeforeStartingEvicting,
+    /// Failpoint hit in the spawned task
+    WaitBeforeDownloading,
+}
+
+pub(crate) enum Failpoint {
+    AfterDeterminingLayerNeedsNoDownload,
+    WaitBeforeStartingEvicting(
+        Option<utils::completion::Completion>,
+        utils::completion::Barrier,
+    ),
+    WaitBeforeDownloading(
+        Option<utils::completion::Completion>,
+        utils::completion::Barrier,
+    ),
+}
+
+impl Failpoint {
+    fn kind(&self) -> FailpointKind {
+        match self {
+            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
+                FailpointKind::AfterDeterminingLayerNeedsNoDownload
+            }
+            Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
+            Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
+        }
+    }
+
+    fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
+        use futures::future::FutureExt;
+
+        // use boxed futures to avoid Either hurdles
+        match self {
+            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
+                let kind = self.kind();
+
+                async move { Err(FailpointHit(kind)) }.boxed()
+            }
+            Failpoint::WaitBeforeStartingEvicting(arrival, b)
+            | Failpoint::WaitBeforeDownloading(arrival, b) => {
+                // first one signals arrival
+                drop(arrival.take());
+
+                let b = b.clone();
+
+                async move {
+                    tracing::trace!("waiting on a failpoint barrier");
+                    b.wait().await;
+                    tracing::trace!("done waiting on a failpoint barrier");
+                    Ok(())
+                }
+                .boxed()
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for FailpointKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(self, f)
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct FailpointHit(FailpointKind);
+
+impl std::fmt::Display for FailpointHit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(self, f)
+    }
+}
+
+impl std::error::Error for FailpointHit {}
+
+impl From<FailpointHit> for DownloadError {
+    fn from(value: FailpointHit) -> Self {
+        DownloadError::Failpoint(value.0)
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,14 +1,13 @@
-use futures::StreamExt;
 use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
-use tracing::Instrument;
 use utils::{
    completion::{self, Completion},
    id::TimelineId,
 };

+use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
-use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
+use crate::context::DownloadBehavior;
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

 /// Used in tests to advance a future to wanted await point, and not futher.
@@ -21,7 +20,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
 /// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
 #[tokio::test]
 async fn smoke_test() {
-    let handle = BACKGROUND_RUNTIME.handle();
+    let handle = tokio::runtime::Handle::current();

    let h = TenantHarness::create("smoke_test").unwrap();
    let span = h.span();
@@ -38,7 +37,7 @@ async fn smoke_test() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -88,7 +87,7 @@ async fn smoke_test() {
    //
    // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
    // artificially slow it down.
-    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;

    match layer
        .evict_and_wait(std::time::Duration::ZERO)
@@ -99,7 +98,7 @@ async fn smoke_test() {
            // expected, but note that the eviction is "still ongoing"
            helper.release().await;
            // exhaust spawn_blocking pool to ensure it is now complete
-            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
+            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
                .await;
        }
        other => unreachable!("{other:?}"),
@@ -108,7 +107,7 @@ async fn smoke_test() {
    // only way to query if a layer is resident is to acquire a ResidentLayer instance.
    // Layer::keep_resident never downloads, but it might initialize if the layer file is found
    // downloaded locally.
-    let none = layer.keep_resident().await.unwrap();
+    let none = layer.keep_resident().await;
    assert!(
        none.is_none(),
        "Expected none, because eviction removed the local file, found: {none:?}"
@@ -167,6 +166,7 @@ async fn smoke_test() {
    rtc.wait_completion().await.unwrap();

    assert_eq!(rtc.get_remote_physical_size(), 0);
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -174,7 +174,7 @@ async fn smoke_test() {
 #[tokio::test(start_paused = true)]
 async fn evict_and_wait_on_wanted_deleted() {
    // this is the runtime on which Layer spawns the blocking tasks on
-    let handle = BACKGROUND_RUNTIME.handle();
+    let handle = tokio::runtime::Handle::current();

    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
        drop(resident);

        // make sure the eviction task gets to run
-        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

        let resident = layer.keep_resident().await;
        assert!(
-            matches!(resident, Ok(None)),
+            resident.is_none(),
            "keep_resident should not have re-initialized: {resident:?}"
        );

@@ -235,24 +235,332 @@ async fn evict_and_wait_on_wanted_deleted() {
        layers.finish_gc_timeline(&[layer]);
    }

-    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

-/// This test shows that ensures we are able to read the layer while the layer eviction has been
-/// started but not completed due to spawn_blocking pool being blocked.
-///
-/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
-#[tokio::test(start_paused = true)]
-async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
-    // this is the runtime on which Layer spawns the blocking tasks on
-    let handle = BACKGROUND_RUNTIME.handle();
-    let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
+/// This test ensures we are able to read the layer while the layer eviction has been
+/// started but not completed.
+#[test]
+fn read_wins_pending_eviction() {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .max_blocking_threads(1)
+        .enable_all()
+        .start_paused(true)
+        .build()
        .unwrap();
+
+    rt.block_on(async move {
+        // this is the runtime on which Layer spawns the blocking tasks on
+        let handle = tokio::runtime::Handle::current();
+        let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
+        let (tenant, ctx) = h.load().await;
+        let span = h.span();
+        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .await
+            .unwrap();
+
+        let layer = {
+            let mut layers = {
+                let layers = timeline.layers.read().await;
+                layers.likely_resident_layers().collect::<Vec<_>>()
+            };
+
+            assert_eq!(layers.len(), 1);
+
+            layers.swap_remove(0)
+        };
+
+        // setup done
+
+        let resident = layer.keep_resident().await.unwrap();
+
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+        // drive the future to await on the status channel
+        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect_err("should had been a timeout since we are holding the layer resident");
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        let (completion, barrier) = utils::completion::channel();
+        let (arrival, arrived_at_barrier) = utils::completion::channel();
+        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+            Some(arrival),
+            barrier,
+        ));
+
+        // now the eviction cannot proceed because the threads are consumed while completion exists
+        drop(resident);
+        arrived_at_barrier.wait().await;
+        assert!(!layer.is_likely_resident());
+
+        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+        layer
+            .0
+            .get_or_maybe_download(false, None)
+            .instrument(download_span)
+            .await
+            .expect("should had reinitialized without downloading");
+
+        assert!(layer.is_likely_resident());
+
+        // reinitialization notifies of new resident status, which should error out all evict_and_wait
+        let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect("no timeout, because get_or_maybe_download re-initialized")
+            .expect_err("eviction should not have succeeded because re-initialized");
+
+        // works as intended: evictions lose to "downloads"
+        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        drop(completion);
+
+        tokio::time::sleep(ADVANCE).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
+            .await;
+
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // now we finally can observe the original eviction failing
+        // it would had been possible to observe it earlier, but here it is guaranteed to have
+        // happened.
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+    });
+}
+
+/// Use failpoint to delay an eviction starting to get a VersionCheckFailed.
+#[test]
+fn multiple_pending_evictions_in_order() {
+    let name = "multiple_pending_evictions_in_order";
+    let in_order = true;
+    multiple_pending_evictions_scenario(name, in_order);
+}
+
+/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState.
+#[test]
+fn multiple_pending_evictions_out_of_order() {
+    let name = "multiple_pending_evictions_out_of_order";
+    let in_order = false;
+    multiple_pending_evictions_scenario(name, in_order);
+}
+
+fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .max_blocking_threads(1)
+        .enable_all()
+        .start_paused(true)
+        .build()
+        .unwrap();
+
+    rt.block_on(async move {
+        // this is the runtime on which Layer spawns the blocking tasks on
+        let handle = tokio::runtime::Handle::current();
+        let h = TenantHarness::create(name).unwrap();
+        let (tenant, ctx) = h.load().await;
+        let span = h.span();
+        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .await
+            .unwrap();
+
+        let layer = {
+            let mut layers = {
+                let layers = timeline.layers.read().await;
+                layers.likely_resident_layers().collect::<Vec<_>>()
+            };
+
+            assert_eq!(layers.len(), 1);
+
+            layers.swap_remove(0)
+        };
+
+        // setup done
+
+        let resident = layer.keep_resident().await.unwrap();
+
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+        // drive the future to await on the status channel
+        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect_err("should had been a timeout since we are holding the layer resident");
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        let (completion1, barrier) = utils::completion::channel();
+        let mut completion1 = Some(completion1);
+        let (arrival, arrived_at_barrier) = utils::completion::channel();
+        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+            Some(arrival),
+            barrier,
+        ));
+
+        // now the eviction cannot proceed because we are simulating arbitrary long delay for the
+        // eviction task start.
+        drop(resident);
+        assert!(!layer.is_likely_resident());
+
+        arrived_at_barrier.wait().await;
+
+        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+        layer
+            .0
+            .get_or_maybe_download(false, None)
+            .instrument(download_span)
+            .await
+            .expect("should had reinitialized without downloading");
+
+        assert!(layer.is_likely_resident());
+
+        // reinitialization notifies of new resident status, which should error out all evict_and_wait
+        let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect("no timeout, because get_or_maybe_download re-initialized")
+            .expect_err("eviction should not have succeeded because re-initialized");
+
+        // works as intended: evictions lose to "downloads"
+        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // configure another failpoint for the second eviction -- evictions are per initialization,
+        // so now that we've reinitialized the inner, we get to run two of them at the same time.
+        let (completion2, barrier) = utils::completion::channel();
+        let (arrival, arrived_at_barrier) = utils::completion::channel();
+        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+            Some(arrival),
+            barrier,
+        ));
+
+        let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+        // advance to the wait on the queue
+        tokio::time::timeout(ADVANCE, &mut second_eviction)
+            .await
+            .expect_err("timeout because failpoint is blocking");
+
+        arrived_at_barrier.wait().await;
+
+        assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+
+        let mut release_earlier_eviction = |expected_reason| {
+            assert_eq!(
+                0,
+                LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+            );
+
+            drop(completion1.take().unwrap());
+
+            let handle = &handle;
+
+            async move {
+                tokio::time::sleep(ADVANCE).await;
+                SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
+                    handle, 1,
+                )
+                .await;
+
+                assert_eq!(
+                    1,
+                    LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+                );
+            }
+        };
+
+        if in_order {
+            release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
+        }
+
+        // release the later eviction which is for the current version
+        drop(completion2);
+        tokio::time::sleep(ADVANCE).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
+            .await;
+
+        if !in_order {
+            release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
+        }
+
+        tokio::time::timeout(ADVANCE, &mut second_eviction)
+            .await
+            .expect("eviction goes through now that spawn_blocking is unclogged")
+            .expect("eviction should succeed, because version matches");
+
+        assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // ensure the cancelled are unchanged
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+    });
+}
+
+/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
+/// a `Layer::keep_resident` call.
+///
+/// This matters because cancelling the eviction would leave us in a state where the file is on
+/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to
+/// have non-repairing `Layer::is_likely_resident`.
+#[tokio::test(start_paused = true)]
+async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
+    let handle = tokio::runtime::Handle::current();
+    let h =
+        TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
@@ -263,7 +571,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
        };

        assert_eq!(layers.len(), 1);
@@ -271,90 +579,154 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
        layers.swap_remove(0)
    };

-    // setup done
+    // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
+    // Err) at the right time as in "during" the `LayerInner::needs_download`.
+    layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);

-    let resident = layer.keep_resident().await.unwrap();
+    let (completion, barrier) = utils::completion::channel();
+    let (arrival, arrived_at_barrier) = utils::completion::channel();
+
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));
+
+    tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
+        .await
+        .expect_err("should had advanced to waiting on channel");
+
+    arrived_at_barrier.wait().await;
+
+    // simulate a cancelled read which is cancelled before it gets to re-initialize
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(
+        matches!(
+            e,
+            DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
+        ),
+        "{e:?}"
+    );
+
+    assert!(
+        layer.0.needs_download().await.unwrap().is_none(),
+        "file is still on disk"
+    );
+
+    // release the eviction task
+    drop(completion);
+    tokio::time::sleep(ADVANCE).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+    // failpoint is still enabled, but it is not hit
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
+
+    // failpoint is not counted as cancellation either
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+}
+
+#[tokio::test(start_paused = true)]
+async fn evict_and_wait_does_not_wait_for_download() {
+    // let handle = tokio::runtime::Handle::current();
+    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
+    let (tenant, ctx) = h.load().await;
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.likely_resident_layers().collect::<Vec<_>>()
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // kind of forced setup: start an eviction but do not allow it progress until we are
+    // downloading
+    let (eviction_can_continue, barrier) = utils::completion::channel();
+    let (arrival, eviction_arrived) = utils::completion::channel();
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));

    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));

-    // drive the future to await on the status channel
+    // use this once-awaited other_evict to synchronize with the eviction
+    let other_evict = layer.evict_and_wait(FOREVER);
+
    tokio::time::timeout(ADVANCE, &mut evict_and_wait)
        .await
-        .expect_err("should had been a timeout since we are holding the layer resident");
-    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+        .expect_err("should had advanced");
+    eviction_arrived.wait().await;
+    drop(eviction_can_continue);
+    other_evict.await.unwrap();

-    // clog up BACKGROUND_RUNTIME spawn_blocking
-    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+    // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
+    assert!(!layer.is_likely_resident());

-    // now the eviction cannot proceed because the threads are consumed while completion exists
-    drop(resident);
+    // following new evict_and_wait will fail until we've completed the download
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound), "{e:?}");

-    // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
-    layer
-        .keep_resident()
-        .await
-        .expect("keep_resident should had reinitialized without downloading")
-        .expect("ResidentLayer");
+    let (download_can_continue, barrier) = utils::completion::channel();
+    let (arrival, _download_arrived) = utils::completion::channel();
+    layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));

-    // because the keep_resident check alters wanted evicted without sending a message, we will
-    // never get completed
-    let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
-        .await
-        .expect("no timeout, because keep_resident re-initialized")
-        .expect_err("eviction should not have succeeded because re-initialized");
+    let mut download = std::pin::pin!(layer
+        .0
+        .get_or_maybe_download(true, None)
+        .instrument(download_span));

-    // works as intended: evictions lose to "downloads"
-    assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-    assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-    // this is not wrong: the eviction is technically still "on the way" as it's still queued
-    // because spawn_blocking is clogged up
-    assert_eq!(
-        0,
-        LAYER_IMPL_METRICS
-            .cancelled_evictions
-            .values()
-            .map(|ctr| ctr.get())
-            .sum::<u64>()
+    assert!(
+        !layer.is_likely_resident(),
+        "during download layer is evicted"
    );

-    let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
-
-    // advance to the wait on the queue
-    tokio::time::timeout(ADVANCE, &mut second_eviction)
+    tokio::time::timeout(ADVANCE, &mut download)
        .await
-        .expect_err("timeout because spawn_blocking is clogged");
+        .expect_err("should had timed out because of failpoint");

-    // in this case we don't leak started evictions, but I think there is still a chance of that
-    // happening, because we could have upgrades race multiple evictions while only one of them
-    // happens?
-    assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+    // now we finally get to continue, and because the latest state is downloading, we deduce that
+    // original eviction succeeded
+    evict_and_wait.await.unwrap();

-    helper.release().await;
+    // however a new evict_and_wait will fail
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound), "{e:?}");

-    // the second_eviction gets to run here
-    //
-    // synchronize to be *strictly* after the second_eviction spawn_blocking run
-    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+    assert!(!layer.is_likely_resident());

-    tokio::time::timeout(ADVANCE, &mut second_eviction)
-        .await
-        .expect("eviction goes through now that spawn_blocking is unclogged")
-        .expect("eviction should succeed, because version matches");
+    drop(download_can_continue);
+    download.await.expect("download should had succeeded");
+    assert!(layer.is_likely_resident());

-    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+    // only now can we evict
+    layer.evict_and_wait(FOREVER).await.unwrap();
+}

-    // now we finally can observe the original spawn_blocking failing
-    // it would had been possible to observe it earlier, but here it is guaranteed to have
-    // happened.
-    assert_eq!(
-        1,
-        LAYER_IMPL_METRICS
-            .cancelled_evictions
-            .values()
-            .map(|ctr| ctr.get())
-            .sum::<u64>()
-    );
+#[test]
+fn layer_size() {
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
+    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
+    // it also has the utf8 path
 }

 struct SpawnBlockingPoolHelper {
@@ -371,31 +743,41 @@ impl SpawnBlockingPoolHelper {
    ///
    /// This should be no issue nowdays, because nextest runs each test in it's own process.
    async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
-        let (completion, barrier) = completion::channel();
-        let (tx, mut rx) = tokio::sync::mpsc::channel(8);
+        let default_max_blocking_threads = 512;

-        let assumed_max_blocking_threads = 512;
+        Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
+    }
+
+    async fn consume_all_spawn_blocking_threads0(
+        handle: &tokio::runtime::Handle,
+        threads: usize,
+    ) -> Self {
+        assert_ne!(threads, 0);
+
+        let (completion, barrier) = completion::channel();
+        let (started, starts_completed) = completion::channel();

        let mut blocking_tasks = JoinSet::new();

-        for _ in 0..assumed_max_blocking_threads {
+        for _ in 0..threads {
            let barrier = barrier.clone();
-            let tx = tx.clone();
+            let started = started.clone();
            blocking_tasks.spawn_blocking_on(
                move || {
-                    tx.blocking_send(()).unwrap();
-                    drop(tx);
+                    drop(started);
                    tokio::runtime::Handle::current().block_on(barrier.wait());
                },
                handle,
            );
        }

+        drop(started);
+
+        starts_completed.wait().await;
+
        drop(barrier);

-        for _ in 0..assumed_max_blocking_threads {
-            rx.recv().await.unwrap();
-        }
+        tracing::trace!("consumed all threads");

        SpawnBlockingPoolHelper {
            awaited_by_spawn_blocking_tasks: completion,
@@ -415,13 +797,22 @@ impl SpawnBlockingPoolHelper {
        while let Some(res) = blocking_tasks.join_next().await {
            res.expect("none of the tasks should had panicked");
        }
+
+        tracing::trace!("released all threads");
    }

    /// In the tests it is used as an easy way of making sure something scheduled on the target
    /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
    /// before our tasks have a chance to schedule and complete.
    async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
-        Self::consume_all_spawn_blocking_threads(handle)
+        Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
+    }
+
+    async fn consume_and_release_all_of_spawn_blocking_threads0(
+        handle: &tokio::runtime::Handle,
+        threads: usize,
+    ) {
+        Self::consume_all_spawn_blocking_threads0(handle, threads)
            .await
            .release()
            .await
@@ -435,7 +826,7 @@ fn spawn_blocking_pool_helper_actually_works() {
    // because the amount is not configurable for our helper, expect the same amount as
    // BACKGROUND_RUNTIME using the tokio defaults would have.
    let rt = tokio::runtime::Builder::new_current_thread()
-        .max_blocking_threads(512)
+        .max_blocking_threads(1)
        .enable_all()
        .build()
        .unwrap();
@@ -445,7 +836,8 @@ fn spawn_blocking_pool_helper_actually_works() {
    rt.block_on(async move {
        // this will not return until all threads are spun up and actually executing the code
        // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
-        let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+        let consumed =
+            SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;

        println!("consumed");

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -8,7 +8,7 @@ use std::time::{Duration, Instant};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
-use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::task_mgr::TaskKind;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
@@ -18,7 +18,7 @@ use utils::{backoff, completion};

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
-        let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+        let total_threads = *crate::task_mgr::THE_RUNTIME_WORKER_THREADS;
        let permits = usize::max(
            1,
            // while a lot of the work is done on spawn_blocking, we still do
@@ -85,7 +85,6 @@ pub fn start_background_loops(
 ) {
    let tenant_shard_id = tenant.tenant_shard_id;
    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
        Some(tenant_shard_id),
        None,
@@ -109,7 +108,6 @@ pub fn start_background_loops(
        },
    );
    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
        Some(tenant_shard_id),
        None,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -13,7 +13,6 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
-use futures::stream::StreamExt;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::AUX_FILES_KEY,
@@ -37,6 +36,7 @@ use tracing::*;
 use utils::{
    bin_ser::BeSer,
    sync::gate::{Gate, GateGuard},
+    vec_map::VecMap,
 };

 use std::ops::{Deref, Range};
@@ -1723,7 +1723,6 @@ impl Timeline {
            initdb_optimization_count: 0,
        };
        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::LayerFlushTask,
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
@@ -2086,7 +2085,6 @@ impl Timeline {
            DownloadBehavior::Download,
        );
        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::InitialLogicalSizeCalculation,
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
@@ -2264,7 +2262,6 @@ impl Timeline {
            DownloadBehavior::Download,
        );
        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::OndemandLogicalSizeCalculation,
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
@@ -2442,7 +2439,7 @@ impl Timeline {

        let guard = self.layers.read().await;

-        let resident = guard.resident_layers().map(|layer| {
+        let resident = guard.likely_resident_layers().map(|layer| {
            let last_activity_ts = layer.access_stats().latest_activity_or_now();

            HeatMapLayer::new(
@@ -2452,7 +2449,7 @@ impl Timeline {
            )
        });

-        let layers = resident.collect().await;
+        let layers = resident.collect();

        Some(HeatMapTimeline::new(self.timeline_id, layers))
    }
@@ -3840,7 +3837,7 @@ impl Timeline {
        };
        let timer = self.metrics.garbage_collect_histo.start_timer();

-        fail_point!("before-timeline-gc");
+        pausable_failpoint!("before-timeline-gc");

        // Is the timeline being deleted?
        if self.is_stopping() {
@@ -4151,7 +4148,6 @@ impl Timeline {

        let self_clone = Arc::clone(&self);
        let task_id = task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
            task_mgr::TaskKind::DownloadAllRemoteLayers,
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
@@ -4302,7 +4298,7 @@ impl Timeline {
        let mut max_layer_size: Option<u64> = None;

        let resident_layers = guard
-            .resident_layers()
+            .likely_resident_layers()
            .map(|layer| {
                let file_size = layer.layer_desc().file_size;
                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
@@ -4315,8 +4311,7 @@ impl Timeline {
                    relative_last_activity: finite_f32::FiniteF32::ZERO,
                }
            })
-            .collect()
-            .await;
+            .collect();

        DiskUsageEvictionInfo {
            max_layer_size,
@@ -4618,16 +4613,15 @@ impl<'a> TimelineWriter<'a> {
        }
    }

-    /// Put a batch keys at the specified Lsns.
+    /// Put a batch of keys at the specified Lsns.
    ///
-    /// The batch should be sorted by Lsn such that it's safe
-    /// to roll the open layer mid batch.
+    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
    pub(crate) async fn put_batch(
        &mut self,
-        batch: Vec<(Key, Lsn, Value)>,
+        batch: VecMap<Lsn, (Key, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        for (key, lsn, val) in batch {
+        for (lsn, (key, val)) in batch {
            self.put(key, lsn, &val, ctx).await?
        }

@@ -4713,7 +4707,6 @@ mod tests {
            .keep_resident()
            .await
            .expect("no download => no downloading errors")
-            .expect("should had been resident")
            .drop_eviction_guard();

        let forever = std::time::Duration::from_secs(120);
@@ -4724,7 +4717,7 @@ mod tests {
        let (first, second) = tokio::join!(first, second);

        let res = layer.keep_resident().await;
-        assert!(matches!(res, Ok(None)), "{res:?}");
+        assert!(res.is_none(), "{res:?}");

        match (first, second) {
            (Ok(()), Ok(())) => {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -443,7 +443,6 @@ impl DeleteTimelineFlow {
        let timeline_id = timeline.timeline_id;

        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
            Some(tenant_shard_id),
            Some(timeline_id),
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -28,7 +28,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
 use crate::{
    context::{DownloadBehavior, RequestContext},
    pgdatadir_mapping::CollectKeySpaceError,
-    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    task_mgr::{self, TaskKind},
    tenant::{
        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
    },
@@ -56,7 +56,6 @@ impl Timeline {
        let self_clone = Arc::clone(self);
        let background_tasks_can_start = background_tasks_can_start.cloned();
        task_mgr::spawn(
-            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
            Some(self.tenant_shard_id),
            Some(self.timeline_id),
@@ -225,24 +224,18 @@ impl Timeline {
        {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
-            for hist_layer in layers.iter_historic_layers() {
-                let hist_layer = guard.get_from_desc(&hist_layer);
+            for layer in layers.iter_historic_layers() {
+                let layer = guard.get_from_desc(&layer);

                // guard against eviction while we inspect it; it might be that eviction_task and
                // disk_usage_eviction_task both select the same layers to be evicted, and
                // seemingly free up double the space. both succeeding is of no consequence.
-                let guard = match hist_layer.keep_resident().await {
-                    Ok(Some(l)) => l,
-                    Ok(None) => continue,
-                    Err(e) => {
-                        // these should not happen, but we cannot make them statically impossible right
-                        // now.
-                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
-                        continue;
-                    }
-                };

-                let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
+                if !layer.is_likely_resident() {
+                    continue;
+                }
+
+                let last_activity_ts = layer.access_stats().latest_activity_or_now();

                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
@@ -265,9 +258,8 @@ impl Timeline {
                        continue;
                    }
                };
-                let layer = guard.drop_eviction_guard();
+
                if no_activity_for > p.threshold {
-                    // this could cause a lot of allocations in some cases
                    js.spawn(async move {
                        layer
                            .evict_and_wait(std::time::Duration::from_secs(5))
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,5 +1,4 @@
 use anyhow::{bail, ensure, Context, Result};
-use futures::StreamExt;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -241,29 +240,16 @@ impl LayerManager {
        layer.delete_on_drop();
    }

-    pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
+    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
        // for small layer maps, we most likely have all resident, but for larger more are likely
        // to be evicted assuming lots of layers correlated with longer lifespan.

-        let layers = self
-            .layer_map()
-            .iter_historic_layers()
-            .map(|desc| self.get_from_desc(&desc));
-
-        let layers = futures::stream::iter(layers);
-
-        layers.filter_map(|layer| async move {
-            // TODO(#6028): this query does not really need to see the ResidentLayer
-            match layer.keep_resident().await {
-                Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
-                Ok(None) => None,
-                Err(e) => {
-                    // these should not happen, but we cannot make them statically impossible right
-                    // now.
-                    tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
-                    None
-                }
-            }
+        self.layer_map().iter_historic_layers().filter_map(|desc| {
+            self.layer_fmgr
+                .0
+                .get(&desc.key())
+                .filter(|l| l.is_likely_resident())
+                .cloned()
        })
    }

--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -24,7 +24,7 @@ mod connection_manager;
 mod walreceiver_connection;

 use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::task_mgr::{self, TaskKind};
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
@@ -82,7 +82,6 @@ impl WalReceiver {
        let loop_status = Arc::new(std::sync::RwLock::new(None));
        let manager_status = Arc::clone(&loop_status);
        task_mgr::spawn(
-            WALRECEIVER_RUNTIME.handle(),
            TaskKind::WalReceiverManager,
            Some(timeline.tenant_shard_id),
            Some(timeline_id),
@@ -181,7 +180,7 @@ impl<E: Clone> TaskHandle<E> {
        let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);

        let cancellation_clone = cancellation.clone();
-        let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
+        let join_handle = tokio::spawn(async move {
            events_sender.send(TaskStateUpdate::Started).ok();
            task(events_sender, cancellation_clone).await
            // events_sender is dropped at some point during the .await above.
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -11,7 +11,6 @@ use std::{
 use anyhow::{anyhow, Context};
 use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
-use fail::fail_point;
 use futures::StreamExt;
 use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
 use postgres_ffi::WAL_SEGMENT_SIZE;
@@ -27,9 +26,7 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    task_mgr,
-    task_mgr::TaskKind,
-    task_mgr::WALRECEIVER_RUNTIME,
+    task_mgr::{self, TaskKind},
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
@@ -163,7 +160,6 @@ pub(super) async fn handle_walreceiver_connection(
    );
    let connection_cancellation = cancellation.clone();
    task_mgr::spawn(
-        WALRECEIVER_RUNTIME.handle(),
        TaskKind::WalReceiverConnectionPoller,
        Some(timeline.tenant_shard_id),
        Some(timeline.timeline_id),
@@ -329,7 +325,17 @@ pub(super) async fn handle_walreceiver_connection(
                            filtered_records += 1;
                        }

-                        fail_point!("walreceiver-after-ingest");
+                        // don't simply use pausable_failpoint here because its spawn_blocking slows
+                        // slows down the tests too much.
+                        fail::fail_point!("walreceiver-after-ingest-blocking");
+                        if let Err(()) = (|| {
+                            fail::fail_point!("walreceiver-after-ingest-pause-activate", |_| {
+                                Err(())
+                            });
+                            Ok(())
+                        })() {
+                            pausable_failpoint!("walreceiver-after-ingest-pause");
+                        }

                        last_rec_lsn = lsn;

@@ -448,6 +454,7 @@ pub(super) async fn handle_walreceiver_connection(
                disk_consistent_lsn,
                remote_consistent_lsn,
                replytime: ts,
+                shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
            };

            debug!("neon_status_update {status_update:?}");
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -109,6 +109,8 @@ impl WalIngest {
            self.checkpoint_modified = true;
        }

+        failpoint_support::sleep_millis_async!("wal-ingest-record-sleep");
+
        match decoded.xl_rmid {
            pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                // Heap AM records need some special handling, because they modify VM pages
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS)
 {
 	int64		size;

-	size = GetZenithCurrentClusterSize();
+	size = GetNeonCurrentClusterSize();

 	if (size == 0)
 		PG_RETURN_NULL();
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -26,6 +26,8 @@ extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

 extern uint64 BackpressureThrottlingTime(void);
+extern void SetNeonCurrentClusterSize(uint64 size);
+extern uint64 GetNeonCurrentClusterSize(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

 extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1831,7 +1831,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
 		!IsAutoVacuumWorkerProcess())
 	{
-		uint64		current_size = GetZenithCurrentClusterSize();
+		uint64		current_size = GetNeonCurrentClusterSize();

 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
@@ -1912,7 +1912,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
 		!IsAutoVacuumWorkerProcess())
 	{
-		uint64		current_size = GetZenithCurrentClusterSize();
+		uint64		current_size = GetNeonCurrentClusterSize();

 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk);
 static bool RecvAppendResponses(Safekeeper *sk);
 static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
 static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
-static void HandleSafekeeperResponse(WalProposer *wp);
+static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
 static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
 static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
 static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state);
@@ -1405,7 +1405,6 @@ static bool
 RecvAppendResponses(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
-	XLogRecPtr	newCommitLsn;
 	bool		readAnything = false;

 	while (true)
@@ -1425,6 +1424,8 @@ RecvAppendResponses(Safekeeper *sk)
 			   LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
 			   sk->host, sk->port);

+		readAnything = true;
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk)
 				   sk->appendResponse.term, wp->propTerm);
 		}

-		readAnything = true;
+		HandleSafekeeperResponse(wp, sk);
 	}

 	if (!readAnything)
 		return sk->state == SS_ACTIVE;

-	/* update commit_lsn */
-	newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
-	/*
-	 * Send the new value to all safekeepers.
-	 */
-	if (newCommitLsn > wp->commitLsn)
-	{
-		wp->commitLsn = newCommitLsn;
-		BroadcastAppendRequest(wp);
-	}
-
-	HandleSafekeeperResponse(wp);
-
 	return sk->state == SS_ACTIVE;
 }

+#define psfeedback_log(fmt, key, ...) \
+	wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__)
+
 /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
 static void
-ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
+ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback)
 {
 	uint8		nkeys;
 	int			i;
-	int32		len;
+
+	/* initialize the struct before parsing */
+	memset(ps_feedback, 0, sizeof(PageserverFeedback));
+	ps_feedback->present = true;

 	/* get number of custom keys */
 	nkeys = pq_getmsgbyte(reply_message);
@@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 	for (i = 0; i < nkeys; i++)
 	{
 		const char *key = pq_getmsgstring(reply_message);
+		unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32));

 		if (strcmp(key, "current_timeline_size") == 0)
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-				   rf->currentClusterSize);
+			Assert(value_len == sizeof(int64));
+			ps_feedback->currentClusterSize = pq_getmsgint64(reply_message);
+			psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->last_received_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->last_received_lsn));
+			Assert(value_len == sizeof(int64));
+			ps_feedback->last_received_lsn = pq_getmsgint64(reply_message);
+			psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+			Assert(value_len == sizeof(int64));
+			ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message);
+			psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
-			wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-				   LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+			Assert(value_len == sizeof(int64));
+			ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message);
+			psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
-			pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-			rf->replytime = pq_getmsgint64(reply_message);
-			{
-				char	   *replyTimeStr;
-
-				/* Copy because timestamptz_to_str returns a static buffer */
-				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
-				wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-					   rf->replytime, replyTimeStr);
-
-				pfree(replyTimeStr);
-			}
+			Assert(value_len == sizeof(int64));
+			ps_feedback->replytime = pq_getmsgint64(reply_message);
+			psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime));
+		}
+		else if (strcmp(key, "shard_number") == 0)
+		{
+			Assert(value_len == sizeof(uint32));
+			ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32));
+			psfeedback_log("%u", key, ps_feedback->shard_number);
 		}
 		else
 		{
-			len = pq_getmsgint(reply_message, sizeof(int32));
-			/* read value length */
-
 			/*
 			 * Skip unknown keys to support backward compatibile protocol
 			 * changes
 			 */
-			wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
-			pq_getmsgbytes(reply_message, len);
+			wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len);
+			pq_getmsgbytes(reply_message, value_len);
 		};
 	}
 }
@@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 	return donor;
 }

+/*
+ * Process AppendResponse message from safekeeper.
+ */
 static void
-HandleSafekeeperResponse(WalProposer *wp)
+HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
 {
 	XLogRecPtr	candidateTruncateLsn;
+	XLogRecPtr  newCommitLsn;

-	wp->api.process_safekeeper_feedback(wp);
+	newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
+	if (newCommitLsn > wp->commitLsn)
+	{
+		wp->commitLsn = newCommitLsn;
+		/* Send new value to all safekeepers. */
+		BroadcastAppendRequest(wp);
+	}
+
+	/* 
+	 * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown().
+	 * The last one will terminate the process if the shutdown is requested
+	 * and WAL is committed by the quorum. BroadcastAppendRequest() should be
+	 * called to notify safekeepers about the new commitLsn.
+	 */
+	wp->api.process_safekeeper_feedback(wp, sk);

 	/*
 	 * Try to advance truncateLsn -- the last record flushed to all
@@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 				msg->hs.ts = pq_getmsgint64_le(&s);
 				msg->hs.xmin.value = pq_getmsgint64_le(&s);
 				msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
-				if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
-					ParsePageserverFeedbackMessage(wp, &s, &msg->rf);
+				if (s.len > s.cursor)
+					ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
+				else
+					msg->ps_feedback.present = false;
 				pq_getmsgend(&s);
 				return true;
 			}
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -10,6 +10,7 @@

 #include "libpqwalproposer.h"
 #include "neon_walreader.h"
+#include "pagestore_client.h"

 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2
@@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback

 typedef struct PageserverFeedback
 {
+	/* true if AppendResponse contains this feedback */
+	bool		present;
 	/* current size of the timeline on pageserver */
 	uint64		currentClusterSize;
 	/* standby_status_update fields that safekeeper received from pageserver */
@@ -276,14 +279,22 @@ typedef struct PageserverFeedback
 	XLogRecPtr	disk_consistent_lsn;
 	XLogRecPtr	remote_consistent_lsn;
 	TimestampTz replytime;
+	uint32		shard_number;
 } PageserverFeedback;

 typedef struct WalproposerShmemState
 {
 	slock_t		mutex;
-	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
+	pg_atomic_uint64 currentClusterSize;
+
+	/* last feedback from each shard */
+	PageserverFeedback shard_ps_feedback[MAX_SHARDS];
+	int num_shards;
+
+	/* aggregated feedback with min LSNs across shards */
+	PageserverFeedback min_ps_feedback;
 } WalproposerShmemState;

 /*
@@ -307,12 +318,12 @@ typedef struct AppendResponse
 	/* Feedback received from pageserver includes standby_status_update fields */
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
-	PageserverFeedback rf;
+	PageserverFeedback ps_feedback;
 } AppendResponse;

 /*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
-#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
+#define APPENDRESPONSE_FIXEDPART_SIZE 56

 struct WalProposer;
 typedef struct WalProposer WalProposer;
@@ -560,11 +571,11 @@ typedef struct walproposer_api
 	void		(*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);

 	/*
-	 * Called after every new message from the safekeeper. Used to propagate
+	 * Called after every AppendResponse from the safekeeper. Used to propagate
 	 * backpressure feedback and to confirm WAL persistence (has been commited
 	 * on the quorum of safekeepers).
 	 */
-	void		(*process_safekeeper_feedback) (WalProposer *wp);
+	void		(*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk);

 	/*
 	 * Write a log message to the internal log processor. This is used only
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -63,7 +63,6 @@ char	   *wal_acceptors_list = "";
 int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;

-static AppendResponse quorumFeedback;
 static WalproposerShmemState *walprop_shared;
 static WalProposerConfig walprop_config;
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;
@@ -71,6 +70,10 @@ static const walproposer_api walprop_pg;
 static volatile sig_atomic_t got_SIGUSR2 = false;
 static bool reported_sigusr2 = false;

+static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr;
+static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr;
+static HotStandbyFeedback agg_hs_feedback;
+
 static void nwp_shmem_startup_hook(void);
 static void nwp_register_gucs(void);
 static void nwp_prepare_shmem(void);
@@ -279,6 +282,7 @@ WalproposerShmemInit(void)
 		memset(walprop_shared, 0, WalproposerShmemSize());
 		SpinLockInit(&walprop_shared->mutex);
 		pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
+		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
 	}
 	LWLockRelease(AddinShmemInitLock);

@@ -402,21 +406,58 @@ walprop_pg_get_shmem_state(WalProposer *wp)
 	return walprop_shared;
 }

-static void
-replication_feedback_set(PageserverFeedback *rf)
+/*
+ * Record new ps_feedback in the array with shards and update min_feedback.
+ */
+static PageserverFeedback
+record_pageserver_feedback(PageserverFeedback *ps_feedback)
 {
+	PageserverFeedback min_feedback;
+
+	Assert(ps_feedback->present);
+	Assert(ps_feedback->shard_number < MAX_SHARDS);
+
 	SpinLockAcquire(&walprop_shared->mutex);
-	memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
+
+	/* Update the number of shards */
+	if (ps_feedback->shard_number + 1 > walprop_shared->num_shards)
+		walprop_shared->num_shards = ps_feedback->shard_number + 1;
+
+	/* Update the feedback */
+	memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback));
+
+	/* Calculate min LSNs */
+	memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback));
+	for (int i = 0; i < walprop_shared->num_shards; i++)
+	{
+		PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];
+		if (feedback->present)
+		{
+			if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
+				min_feedback.last_received_lsn = feedback->last_received_lsn;
+			
+			if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
+				min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;
+			
+			if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
+				min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
+		}
+	}
+	/* Copy min_feedback back to shmem */
+	memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback));
+
 	SpinLockRelease(&walprop_shared->mutex);
+
+	return min_feedback;
 }

 void
 replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	*writeLsn = walprop_shared->feedback.last_received_lsn;
-	*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
-	*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
+	*writeLsn = walprop_shared->min_ps_feedback.last_received_lsn;
+	*flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn;
+	*applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn;
 	SpinLockRelease(&walprop_shared->mutex);
 }

@@ -509,9 +550,10 @@ walprop_pg_init_standalone_sync_safekeepers(void)
 static void
 walprop_sigusr2(SIGNAL_ARGS)
 {
+	int			save_errno = errno;
 	got_SIGUSR2 = true;
-
 	SetLatch(MyLatch);
+	errno = save_errno;
 }

 static void
@@ -1869,39 +1911,6 @@ CheckGracefulShutdown(WalProposer *wp)
 	}
 }

-/*
- * Choose most advanced PageserverFeedback and set it to *rf.
- */
-static void
-GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
-{
-	int			latest_safekeeper = 0;
-	XLogRecPtr	last_received_lsn = InvalidXLogRecPtr;
-
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
-		{
-			latest_safekeeper = i;
-			last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn;
-		}
-	}
-
-	rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
-	rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
-	rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
-	rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
-	rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
-
-	wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-			" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
-			rf->currentClusterSize,
-			LSN_FORMAT_ARGS(rf->last_received_lsn),
-			LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
-			LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
-			rf->replytime);
-}
-
 /*
 * Combine hot standby feedbacks from all safekeepers.
 */
@@ -1949,26 +1958,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 * None of that is functional in sync-safekeepers.
 */
 static void
-walprop_pg_process_safekeeper_feedback(WalProposer *wp)
+walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 {
-	HotStandbyFeedback hsFeedback;
-	XLogRecPtr	oldDiskConsistentLsn;
+	HotStandbyFeedback	hsFeedback;
+	bool				needToAdvanceSlot = false;

 	if (wp->config->syncSafekeepers)
 		return;

-	oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
-
-	/* Get PageserverFeedback fields from the most advanced safekeeper */
-	GetLatestNeonFeedback(&quorumFeedback.rf, wp);
-	replication_feedback_set(&quorumFeedback.rf);
-	SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
-
-	if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
+	/* handle fresh ps_feedback */
+	if (sk->appendResponse.ps_feedback.present)
 	{
-		if (wp->commitLsn > quorumFeedback.flushLsn)
-			quorumFeedback.flushLsn = wp->commitLsn;
+		PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback);

+		/* Only one main shard sends non-zero currentClusterSize */
+		if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
+			SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
+
+		if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
+		{
+			standby_apply_lsn = min_feedback.disk_consistent_lsn;
+			needToAdvanceSlot = true;
+		}
+	}
+
+	if (wp->commitLsn > standby_flush_lsn)
+	{
+		standby_flush_lsn = wp->commitLsn;
+		needToAdvanceSlot = true;
+	}
+
+	if (needToAdvanceSlot)
+	{
 		/*
 		 * Advance the replication slot to commitLsn. WAL before it is
 		 * hardened and will be fetched from one of safekeepers by
@@ -1977,23 +1998,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp)
 		 * Also wakes up syncrep waiters.
 		 */
 		ProcessStandbyReply(
-		/* write_lsn -  This is what durably stored in WAL service. */
-							quorumFeedback.flushLsn,
-		/* flush_lsn - This is what durably stored in WAL service. */
-							quorumFeedback.flushLsn,
+		/* write_lsn -  This is what durably stored in safekeepers quorum. */
+							standby_flush_lsn,
+		/* flush_lsn - This is what durably stored in safekeepers quorum. */
+							standby_flush_lsn,

 		/*
 		 * apply_lsn - This is what processed and durably saved at*
 		 * pageserver.
 		 */
-							quorumFeedback.rf.disk_consistent_lsn,
+							standby_apply_lsn,
 							walprop_pg_get_current_timestamp(wp), false);
 	}

 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
+	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
-		quorumFeedback.hs = hsFeedback;
+		agg_hs_feedback = hsFeedback;
 		ProcessStandbyHSFeedback(hsFeedback.ts,
 								 XidFromFullTransactionId(hsFeedback.xmin),
 								 EpochFromFullTransactionId(hsFeedback.xmin),
@@ -2074,6 +2095,18 @@ GetLogRepRestartLSN(WalProposer *wp)
 	return lrRestartLsn;
 }

+void SetNeonCurrentClusterSize(uint64 size)
+{
+	pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
+}
+
+uint64 GetNeonCurrentClusterSize(void)
+{
+	return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
+}
+uint64 GetNeonCurrentClusterSize(void);
+
+
 static const walproposer_api walprop_pg = {
 	.get_shmem_state = walprop_pg_get_shmem_state,
 	.start_streaming = walprop_pg_start_streaming,
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -11,6 +11,10 @@ testing = []
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
+aws-config.workspace = true
+aws-sdk-iam.workspace = true
+aws-sigv4.workspace = true
+aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -27,6 +31,7 @@ hashlink.workspace = true
 hex.workspace = true
 hmac.workspace = true
 hostname.workspace = true
+http.workspace = true
 humantime.workspace = true
 hyper-tungstenite.workspace = true
 hyper.workspace = true
@@ -59,10 +64,11 @@ rustls.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-sha2.workspace = true
+sha2 = { workspace = true, features = ["asm"] }
 smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
+subtle.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
@@ -91,6 +97,7 @@ workspace_hack.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
+fallible-iterator.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -254,7 +254,7 @@ async fn authenticate_with_secret(
    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
    if let Some(password) = unauthenticated_password {
-        let auth_outcome = validate_password_and_exchange(&password, secret)?;
+        let auth_outcome = validate_password_and_exchange(&password, secret).await?;
        let keys = match auth_outcome {
            crate::sasl::Outcome::Success(key) => key,
            crate::sasl::Outcome::Failure(reason) => {
@@ -408,3 +408,228 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use bytes::BytesMut;
+    use fallible_iterator::FallibleIterator;
+    use postgres_protocol::{
+        authentication::sasl::{ChannelBinding, ScramSha256},
+        message::{backend::Message as PgMessage, frontend},
+    };
+    use provider::AuthSecret;
+    use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
+
+    use crate::{
+        auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
+        config::AuthenticationConfig,
+        console::{
+            self,
+            provider::{self, CachedAllowedIps, CachedRoleSecret},
+            CachedNodeInfo,
+        },
+        context::RequestMonitoring,
+        proxy::NeonOptions,
+        scram::ServerSecret,
+        stream::{PqStream, Stream},
+    };
+
+    use super::auth_quirks;
+
+    struct Auth {
+        ips: Vec<IpPattern>,
+        secret: AuthSecret,
+    }
+
+    impl console::Api for Auth {
+        async fn get_role_secret(
+            &self,
+            _ctx: &mut RequestMonitoring,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
+            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
+        }
+
+        async fn get_allowed_ips_and_secret(
+            &self,
+            _ctx: &mut RequestMonitoring,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
+        {
+            Ok((
+                CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
+                Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
+            ))
+        }
+
+        async fn wake_compute(
+            &self,
+            _ctx: &mut RequestMonitoring,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+            unimplemented!()
+        }
+    }
+
+    static CONFIG: &AuthenticationConfig = &AuthenticationConfig {
+        scram_protocol_timeout: std::time::Duration::from_secs(5),
+    };
+
+    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
+        loop {
+            r.read_buf(&mut *b).await.unwrap();
+            if let Some(m) = PgMessage::parse(&mut *b).unwrap() {
+                break m;
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn auth_quirks_scram() {
+        let (mut client, server) = tokio::io::duplex(1024);
+        let mut stream = PqStream::new(Stream::from_raw(server));
+
+        let mut ctx = RequestMonitoring::test();
+        let api = Auth {
+            ips: vec![],
+            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
+        };
+
+        let user_info = ComputeUserInfoMaybeEndpoint {
+            user: "conrad".into(),
+            endpoint_id: Some("endpoint".into()),
+            options: NeonOptions::default(),
+        };
+
+        let handle = tokio::spawn(async move {
+            let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported());
+
+            let mut read = BytesMut::new();
+
+            // server should offer scram
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationSasl(a) => {
+                    let options: Vec<&str> = a.mechanisms().collect().unwrap();
+                    assert_eq!(options, ["SCRAM-SHA-256"]);
+                }
+                _ => panic!("wrong message"),
+            }
+
+            // client sends client-first-message
+            let mut write = BytesMut::new();
+            frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap();
+            client.write_all(&write).await.unwrap();
+
+            // server response with server-first-message
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationSaslContinue(a) => {
+                    scram.update(a.data()).await.unwrap();
+                }
+                _ => panic!("wrong message"),
+            }
+
+            // client response with client-final-message
+            write.clear();
+            frontend::sasl_response(scram.message(), &mut write).unwrap();
+            client.write_all(&write).await.unwrap();
+
+            // server response with server-final-message
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationSaslFinal(a) => {
+                    scram.finish(a.data()).unwrap();
+                }
+                _ => panic!("wrong message"),
+            }
+        });
+
+        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG)
+            .await
+            .unwrap();
+
+        handle.await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn auth_quirks_cleartext() {
+        let (mut client, server) = tokio::io::duplex(1024);
+        let mut stream = PqStream::new(Stream::from_raw(server));
+
+        let mut ctx = RequestMonitoring::test();
+        let api = Auth {
+            ips: vec![],
+            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
+        };
+
+        let user_info = ComputeUserInfoMaybeEndpoint {
+            user: "conrad".into(),
+            endpoint_id: Some("endpoint".into()),
+            options: NeonOptions::default(),
+        };
+
+        let handle = tokio::spawn(async move {
+            let mut read = BytesMut::new();
+            let mut write = BytesMut::new();
+
+            // server should offer cleartext
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationCleartextPassword => {}
+                _ => panic!("wrong message"),
+            }
+
+            // client responds with password
+            write.clear();
+            frontend::password_message(b"my-secret-password", &mut write).unwrap();
+            client.write_all(&write).await.unwrap();
+        });
+
+        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
+            .await
+            .unwrap();
+
+        handle.await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn auth_quirks_password_hack() {
+        let (mut client, server) = tokio::io::duplex(1024);
+        let mut stream = PqStream::new(Stream::from_raw(server));
+
+        let mut ctx = RequestMonitoring::test();
+        let api = Auth {
+            ips: vec![],
+            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
+        };
+
+        let user_info = ComputeUserInfoMaybeEndpoint {
+            user: "conrad".into(),
+            endpoint_id: None,
+            options: NeonOptions::default(),
+        };
+
+        let handle = tokio::spawn(async move {
+            let mut read = BytesMut::new();
+
+            // server should offer cleartext
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationCleartextPassword => {}
+                _ => panic!("wrong message"),
+            }
+
+            // client responds with password
+            let mut write = BytesMut::new();
+            frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write)
+                .unwrap();
+            client.write_all(&write).await.unwrap();
+        });
+
+        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
+            .await
+            .unwrap();
+
+        assert_eq!(creds.info.endpoint, "my-endpoint");
+
+        handle.await.unwrap();
+    }
+}
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -126,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
            .strip_suffix(&[0])
            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;

-        let outcome = validate_password_and_exchange(password, self.state.0)?;
+        let outcome = validate_password_and_exchange(password, self.state.0).await?;

        if let sasl::Outcome::Success(_) = &outcome {
            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -180,7 +180,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
    }
 }

-pub(crate) fn validate_password_and_exchange(
+pub(crate) async fn validate_password_and_exchange(
    password: &[u8],
    secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -194,13 +194,7 @@ pub(crate) fn validate_password_and_exchange(
        }
        // perform scram authentication as both client and server to validate the keys
        AuthSecret::Scram(scram_secret) => {
-            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
-            let outcome = crate::scram::exchange(
-                &scram_secret,
-                sasl_client,
-                crate::config::TlsServerEndPoint::Undefined,
-            )?;
+            let outcome = crate::scram::exchange(&scram_secret, password).await?;

            let client_key = match outcome {
                sasl::Outcome::Success(client_key) => client_key,
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,3 +1,10 @@
+use aws_config::environment::EnvironmentVariableCredentialsProvider;
+use aws_config::imds::credentials::ImdsCredentialsProvider;
+use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_config::meta::region::RegionProviderChain;
+use aws_config::profile::ProfileFileCredentialsProvider;
+use aws_config::provider_config::ProviderConfig;
+use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
 use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::MaybeOwned;
@@ -10,11 +17,14 @@ use proxy::config::ProjectInfoCacheOptions;
 use proxy::console;
 use proxy::context::parquet::ParquetUploadArgs;
 use proxy::http;
+use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
+use proxy::redis::cancellation_publisher::RedisPublisherClient;
+use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
+use proxy::redis::elasticache;
 use proxy::redis::notifications;
-use proxy::redis::publisher::RedisPublisherClient;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;

@@ -150,9 +160,24 @@ struct ProxyCliArgs {
    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_ip_check_for_http: bool,
-    /// redis url for notifications.
+    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
+    /// redis host for streaming connections (might be different from the notifications host)
+    #[clap(long)]
+    redis_host: Option<String>,
+    /// redis port for streaming connections (might be different from the notifications host)
+    #[clap(long)]
+    redis_port: Option<u16>,
+    /// redis cluster name, used in aws elasticache
+    #[clap(long)]
+    redis_cluster_name: Option<String>,
+    /// redis user_id, used in aws elasticache
+    #[clap(long)]
+    redis_user_id: Option<String>,
+    /// aws region to retrieve credentials
+    #[clap(long, default_value_t = String::new())]
+    aws_region: String,
    /// cache for `project_info` (use `size=0` to disable)
    #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
    project_info_cache: String,
@@ -216,6 +241,61 @@ async fn main() -> anyhow::Result<()> {
    let config = build_config(&args)?;

    info!("Authentication backend: {}", config.auth_backend);
+    info!("Using region: {}", config.aws_region);
+
+    let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
+    let provider_conf =
+        ProviderConfig::without_region().with_region(region_provider.region().await);
+    let aws_credentials_provider = {
+        // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+        CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
+            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+            .or_else(
+                "profile-sso",
+                ProfileFileCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else(
+                "token",
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses imds v2
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
+    };
+    let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
+        elasticache::AWSIRSAConfig::new(
+            config.aws_region.clone(),
+            args.redis_cluster_name,
+            args.redis_user_id,
+        ),
+        aws_credentials_provider,
+    ));
+    let redis_notifications_client =
+        match (args.redis_notifications, (args.redis_host, args.redis_port)) {
+            (Some(url), _) => {
+                info!("Starting redis notifications listener ({url})");
+                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
+            }
+            (None, (Some(host), Some(port))) => Some(
+                ConnectionWithCredentialsProvider::new_with_credentials_provider(
+                    host,
+                    port,
+                    elasticache_credentials_provider.clone(),
+                ),
+            ),
+            (None, (None, None)) => {
+                warn!("Redis is disabled");
+                None
+            }
+            _ => {
+                bail!("redis-host and redis-port must be specified together");
+            }
+        };

    // Check that we can bind to address before further initialization
    let http_address: SocketAddr = args.http.parse()?;
@@ -233,17 +313,22 @@ async fn main() -> anyhow::Result<()> {

    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
    let cancel_map = CancelMap::default();
-    let redis_publisher = match &args.redis_notifications {
-        Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
-            url,
+
+    // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
+    let redis_publisher = match &redis_notifications_client {
+        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
+            redis_publisher.clone(),
            args.region.clone(),
            &config.redis_rps_limit,
        )?))),
        None => None,
    };
-    let cancellation_handler = Arc::new(CancellationHandler::new(
+    let cancellation_handler = Arc::new(CancellationHandler::<
+        Option<Arc<tokio::sync::Mutex<RedisPublisherClient>>>,
+    >::new(
        cancel_map.clone(),
        redis_publisher,
+        NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
    ));

    // client facing tasks. these will exit on error or on cancellation
@@ -290,17 +375,16 @@ async fn main() -> anyhow::Result<()> {

    if let auth::BackendType::Console(api, _) = &config.auth_backend {
        if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
-            let cache = api.caches.project_info.clone();
-            if let Some(url) = args.redis_notifications {
-                info!("Starting redis notifications listener ({url})");
+            if let Some(redis_notifications_client) = redis_notifications_client {
+                let cache = api.caches.project_info.clone();
                maintenance_tasks.spawn(notifications::task_main(
-                    url.to_owned(),
+                    redis_notifications_client.clone(),
                    cache.clone(),
                    cancel_map.clone(),
                    args.region.clone(),
                ));
+                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
            }
-            maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
        }
    }

@@ -445,8 +529,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        endpoint_rps_limit,
        redis_rps_limit,
        handshake_timeout: args.handshake_timeout,
-        // TODO: add this argument
        region: args.region.clone(),
+        aws_region: args.aws_region.clone(),
    }));

    Ok(config)
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,4 +1,3 @@
-use async_trait::async_trait;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::{net::SocketAddr, sync::Arc};
@@ -10,18 +9,26 @@ use tracing::info;
 use uuid::Uuid;

 use crate::{
-    error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS,
-    redis::publisher::RedisPublisherClient,
+    error::ReportableError,
+    metrics::NUM_CANCELLATION_REQUESTS,
+    redis::cancellation_publisher::{
+        CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
+    },
 };

 pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
+pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
+pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;

 /// Enables serving `CancelRequest`s.
 ///
-/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances.
-pub struct CancellationHandler {
+/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
+pub struct CancellationHandler<P> {
    map: CancelMap,
-    redis_client: Option<Arc<Mutex<RedisPublisherClient>>>,
+    client: P,
+    /// This field used for the monitoring purposes.
+    /// Represents the source of the cancellation request.
+    from: &'static str,
 }

 #[derive(Debug, Error)]
@@ -44,49 +51,9 @@ impl ReportableError for CancelError {
    }
 }

-impl CancellationHandler {
-    pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self {
-        Self { map, redis_client }
-    }
-    /// Cancel a running query for the corresponding connection.
-    pub async fn cancel_session(
-        &self,
-        key: CancelKeyData,
-        session_id: Uuid,
-    ) -> Result<(), CancelError> {
-        let from = "from_client";
-        // NB: we should immediately release the lock after cloning the token.
-        let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
-            tracing::warn!("query cancellation key not found: {key}");
-            if let Some(redis_client) = &self.redis_client {
-                NUM_CANCELLATION_REQUESTS
-                    .with_label_values(&[from, "not_found"])
-                    .inc();
-                info!("publishing cancellation key to Redis");
-                match redis_client.lock().await.try_publish(key, session_id).await {
-                    Ok(()) => {
-                        info!("cancellation key successfuly published to Redis");
-                    }
-                    Err(e) => {
-                        tracing::error!("failed to publish a message: {e}");
-                        return Err(CancelError::IO(std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            e.to_string(),
-                        )));
-                    }
-                }
-            }
-            return Ok(());
-        };
-        NUM_CANCELLATION_REQUESTS
-            .with_label_values(&[from, "found"])
-            .inc();
-        info!("cancelling query per user's request using key {key}");
-        cancel_closure.try_cancel_query().await
-    }
-
+impl<P: CancellationPublisher> CancellationHandler<P> {
    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
-    pub fn get_session(self: Arc<Self>) -> Session {
+    pub fn get_session(self: Arc<Self>) -> Session<P> {
        // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
        // expose it and we don't want to do another roundtrip to query
        // for it. The client will be able to notice that this is not the
@@ -112,9 +79,39 @@ impl CancellationHandler {
            cancellation_handler: self,
        }
    }
+    /// Try to cancel a running query for the corresponding connection.
+    /// If the cancellation key is not found, it will be published to Redis.
+    pub async fn cancel_session(
+        &self,
+        key: CancelKeyData,
+        session_id: Uuid,
+    ) -> Result<(), CancelError> {
+        // NB: we should immediately release the lock after cloning the token.
+        let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
+            tracing::warn!("query cancellation key not found: {key}");
+            NUM_CANCELLATION_REQUESTS
+                .with_label_values(&[self.from, "not_found"])
+                .inc();
+            match self.client.try_publish(key, session_id).await {
+                Ok(()) => {} // do nothing
+                Err(e) => {
+                    return Err(CancelError::IO(std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        e.to_string(),
+                    )));
+                }
+            }
+            return Ok(());
+        };
+        NUM_CANCELLATION_REQUESTS
+            .with_label_values(&[self.from, "found"])
+            .inc();
+        info!("cancelling query per user's request using key {key}");
+        cancel_closure.try_cancel_query().await
+    }

    #[cfg(test)]
-    fn contains(&self, session: &Session) -> bool {
+    fn contains(&self, session: &Session<P>) -> bool {
        self.map.contains_key(&session.key)
    }

@@ -124,31 +121,19 @@ impl CancellationHandler {
    }
 }

-#[async_trait]
-pub trait NotificationsCancellationHandler {
-    async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>;
+impl CancellationHandler<()> {
+    pub fn new(map: CancelMap, from: &'static str) -> Self {
+        Self {
+            map,
+            client: (),
+            from,
+        }
+    }
 }

-#[async_trait]
-impl NotificationsCancellationHandler for CancellationHandler {
-    async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> {
-        let from = "from_redis";
-        let cancel_closure = self.map.get(&key).and_then(|x| x.clone());
-        match cancel_closure {
-            Some(cancel_closure) => {
-                NUM_CANCELLATION_REQUESTS
-                    .with_label_values(&[from, "found"])
-                    .inc();
-                cancel_closure.try_cancel_query().await
-            }
-            None => {
-                NUM_CANCELLATION_REQUESTS
-                    .with_label_values(&[from, "not_found"])
-                    .inc();
-                tracing::warn!("query cancellation key not found: {key}");
-                Ok(())
-            }
-        }
+impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
+    pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
+        Self { map, client, from }
    }
 }

@@ -178,14 +163,14 @@ impl CancelClosure {
 }

 /// Helper for registering query cancellation tokens.
-pub struct Session {
+pub struct Session<P> {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
    /// The [`CancelMap`] this session belongs to.
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandler<P>>,
 }

-impl Session {
+impl<P> Session<P> {
    /// Store the cancel token for the given session.
    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
    pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
@@ -198,7 +183,7 @@ impl Session {
    }
 }

-impl Drop for Session {
+impl<P> Drop for Session<P> {
    fn drop(&mut self) {
        self.cancellation_handler.map.remove(&self.key);
        info!("dropped query cancellation key {}", &self.key);
@@ -207,14 +192,16 @@ impl Drop for Session {

 #[cfg(test)]
 mod tests {
+    use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
+
    use super::*;

    #[tokio::test]
    async fn check_session_drop() -> anyhow::Result<()> {
-        let cancellation_handler = Arc::new(CancellationHandler {
-            map: CancelMap::default(),
-            redis_client: None,
-        });
+        let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
+            CancelMap::default(),
+            NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
+        ));

        let session = cancellation_handler.clone().get_session();
        assert!(cancellation_handler.contains(&session));
@@ -224,4 +211,19 @@ mod tests {

        Ok(())
    }
+
+    #[tokio::test]
+    async fn cancel_session_noop_regression() {
+        let handler = CancellationHandler::<()>::new(Default::default(), "local");
+        handler
+            .cancel_session(
+                CancelKeyData {
+                    backend_pid: 0,
+                    cancel_key: 0,
+                },
+                Uuid::new_v4(),
+            )
+            .await
+            .unwrap();
+    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -82,14 +82,13 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 /// A config for establishing a connection to compute node.
 /// Eventually, `tokio_postgres` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
-#[derive(Clone)]
-#[repr(transparent)]
+#[derive(Clone, Default)]
 pub struct ConnCfg(Box<tokio_postgres::Config>);

 /// Creation and initialization routines.
 impl ConnCfg {
    pub fn new() -> Self {
-        Self(Default::default())
+        Self::default()
    }

    /// Reuse password or auth keys from the other config.
@@ -165,12 +164,6 @@ impl std::ops::DerefMut for ConnCfg {
    }
 }

-impl Default for ConnCfg {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl ConnCfg {
    /// Establish a raw TCP connection to the compute node.
    async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -28,6 +28,7 @@ pub struct ProxyConfig {
    pub redis_rps_limit: Vec<RateBucketInfo>,
    pub region: String,
    pub handshake_timeout: Duration,
+    pub aws_region: String,
 }

 #[derive(Debug)]
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -6,7 +6,7 @@ pub mod messages;

 /// Wrappers for console APIs and their mocks.
 pub mod provider;
-pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
+pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};

 /// Various cache-related types.
 pub mod caches {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -14,7 +14,6 @@ use crate::{
    context::RequestMonitoring,
    scram, EndpointCacheKey, ProjectId,
 };
-use async_trait::async_trait;
 use dashmap::DashMap;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
@@ -326,8 +325,7 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPatt

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
-#[async_trait]
-pub trait Api {
+pub(crate) trait Api {
    /// Get the client's auth secret for authentication.
    /// Returns option because user not found situation is special.
    /// We still have to mock the scram to avoid leaking information that user doesn't exist.
@@ -363,7 +361,6 @@ pub enum ConsoleBackend {
    Test(Box<dyn crate::auth::backend::TestBackend>),
 }

-#[async_trait]
 impl Api for ConsoleBackend {
    async fn get_role_secret(
        &self,
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -8,7 +8,6 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
-use async_trait::async_trait;
 use futures::TryFutureExt;
 use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
@@ -144,7 +143,6 @@ async fn get_execute_postgres_query(
    Ok(Some(entry))
 }

-#[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_role_secret(
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,7 +14,6 @@ use crate::{
    context::RequestMonitoring,
    metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
 };
-use async_trait::async_trait;
 use futures::TryFutureExt;
 use std::sync::Arc;
 use tokio::time::Instant;
@@ -168,7 +167,6 @@ impl Api {
    }
 }

-#[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_role_secret(
--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -2,14 +2,21 @@ use anyhow::{anyhow, bail};
 use hyper::{Body, Request, Response, StatusCode};
 use std::{convert::Infallible, net::TcpListener};
 use tracing::info;
-use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
+use utils::http::{
+    endpoint::{self, prometheus_metrics_handler, request_span},
+    error::ApiError,
+    json::json_response,
+    RouterBuilder, RouterService,
+};

 async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, "")
 }

 fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
-    endpoint::make_router().get("/v1/status", status_handler)
+    endpoint::make_router()
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/v1/status", status_handler)
 }

 pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -161,6 +161,9 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

+pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
+pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
+
 pub enum Waiting {
    Cplane,
    Client,
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -10,7 +10,7 @@ pub mod wake_compute;

 use crate::{
    auth,
-    cancellation::{self, CancellationHandler},
+    cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal},
    compute,
    config::{ProxyConfig, TlsConfig},
    context::RequestMonitoring,
@@ -62,7 +62,7 @@ pub async fn task_main(
    listener: tokio::net::TcpListener,
    cancellation_token: CancellationToken,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandlerMain>,
 ) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("proxy has shut down");
@@ -233,12 +233,12 @@ impl ReportableError for ClientRequestError {
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandlerMain>,
    stream: S,
    mode: ClientMode,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    conn_gauge: IntCounterPairGuard,
-) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
+) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
    info!("handling interactive connection from client");

    let proto = ctx.protocol;
@@ -338,9 +338,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(

 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
-async fn prepare_client_connection(
+async fn prepare_client_connection<P>(
    node: &compute::PostgresConnection,
-    session: &cancellation::Session,
+    session: &cancellation::Session<P>,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> Result<(), std::io::Error> {
    // Register compute's query cancellation token and produce a new, unique one.
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -55,17 +55,17 @@ pub async fn proxy_pass(
    Ok(())
 }

-pub struct ProxyPassthrough<S> {
+pub struct ProxyPassthrough<P, S> {
    pub client: Stream<S>,
    pub compute: PostgresConnection,
    pub aux: MetricsAuxInfo,

    pub req: IntCounterPairGuard,
    pub conn: IntCounterPairGuard,
-    pub cancel: cancellation::Session,
+    pub cancel: cancellation::Session<P>,
 }

-impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
+impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
    pub async fn proxy_pass(self) -> anyhow::Result<()> {
        let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
        self.compute.cancel_closure.try_cancel_query().await?;
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -135,9 +135,10 @@ impl TestAuth for NoAuth {}
 struct Scram(scram::ServerSecret);

 impl Scram {
-    fn new(password: &str) -> anyhow::Result<Self> {
-        let secret =
-            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
+    async fn new(password: &str) -> anyhow::Result<Self> {
+        let secret = scram::ServerSecret::build(password)
+            .await
+            .context("failed to generate scram secret")?;
        Ok(Scram(secret))
    }

@@ -284,7 +285,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new(password)?,
+        Scram::new(password).await?,
    ));

    let (_client, _conn) = tokio_postgres::Config::new()
@@ -308,7 +309,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new("password")?,
+        Scram::new("password").await?,
    ));

    let (_client, _conn) = tokio_postgres::Config::new()
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new("password")?,
+        Scram::new("password").await?,
    ));

    let _client_err = tokio_postgres::Config::new()
@@ -231,7 +231,7 @@ async fn connect_failure(
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new("password")?,
+        Scram::new("password").await?,
    ));

    let _client_err = tokio_postgres::Config::new()
--- a/proxy/src/redis.rs
+++ b/proxy/src/redis.rs
@@ -1,2 +1,4 @@
+pub mod cancellation_publisher;
+pub mod connection_with_credentials_provider;
+pub mod elasticache;
 pub mod notifications;
-pub mod publisher;
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -0,0 +1,161 @@
+use std::sync::Arc;
+
+use pq_proto::CancelKeyData;
+use redis::AsyncCommands;
+use tokio::sync::Mutex;
+use uuid::Uuid;
+
+use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
+
+use super::{
+    connection_with_credentials_provider::ConnectionWithCredentialsProvider,
+    notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME},
+};
+
+pub trait CancellationPublisherMut: Send + Sync + 'static {
+    #[allow(async_fn_in_trait)]
+    async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()>;
+}
+
+pub trait CancellationPublisher: Send + Sync + 'static {
+    #[allow(async_fn_in_trait)]
+    async fn try_publish(
+        &self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()>;
+}
+
+impl CancellationPublisher for () {
+    async fn try_publish(
+        &self,
+        _cancel_key_data: CancelKeyData,
+        _session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        Ok(())
+    }
+}
+
+impl<P: CancellationPublisher> CancellationPublisherMut for P {
+    async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        <P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await
+    }
+}
+
+impl<P: CancellationPublisher> CancellationPublisher for Option<P> {
+    async fn try_publish(
+        &self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        if let Some(p) = self {
+            p.try_publish(cancel_key_data, session_id).await
+        } else {
+            Ok(())
+        }
+    }
+}
+
+impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
+    async fn try_publish(
+        &self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        self.lock()
+            .await
+            .try_publish(cancel_key_data, session_id)
+            .await
+    }
+}
+
+pub struct RedisPublisherClient {
+    client: ConnectionWithCredentialsProvider,
+    region_id: String,
+    limiter: RedisRateLimiter,
+}
+
+impl RedisPublisherClient {
+    pub fn new(
+        client: ConnectionWithCredentialsProvider,
+        region_id: String,
+        info: &'static [RateBucketInfo],
+    ) -> anyhow::Result<Self> {
+        Ok(Self {
+            client,
+            region_id,
+            limiter: RedisRateLimiter::new(info),
+        })
+    }
+
+    async fn publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
+            region_id: Some(self.region_id.clone()),
+            cancel_key_data,
+            session_id,
+        }))?;
+        self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
+        Ok(())
+    }
+    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
+        match self.client.connect().await {
+            Ok(()) => {}
+            Err(e) => {
+                tracing::error!("failed to connect to redis: {e}");
+                return Err(e);
+            }
+        }
+        Ok(())
+    }
+    async fn try_publish_internal(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        if !self.limiter.check() {
+            tracing::info!("Rate limit exceeded. Skipping cancellation message");
+            return Err(anyhow::anyhow!("Rate limit exceeded"));
+        }
+        match self.publish(cancel_key_data, session_id).await {
+            Ok(()) => return Ok(()),
+            Err(e) => {
+                tracing::error!("failed to publish a message: {e}");
+            }
+        }
+        tracing::info!("Publisher is disconnected. Reconnectiong...");
+        self.try_connect().await?;
+        self.publish(cancel_key_data, session_id).await
+    }
+}
+
+impl CancellationPublisherMut for RedisPublisherClient {
+    async fn try_publish(
+        &mut self,
+        cancel_key_data: CancelKeyData,
+        session_id: Uuid,
+    ) -> anyhow::Result<()> {
+        tracing::info!("publishing cancellation key to Redis");
+        match self.try_publish_internal(cancel_key_data, session_id).await {
+            Ok(()) => {
+                tracing::info!("cancellation key successfuly published to Redis");
+                Ok(())
+            }
+            Err(e) => {
+                tracing::error!("failed to publish a message: {e}");
+                Err(e)
+            }
+        }
+    }
+}
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -0,0 +1,225 @@
+use std::{sync::Arc, time::Duration};
+
+use futures::FutureExt;
+use redis::{
+    aio::{ConnectionLike, MultiplexedConnection},
+    ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult,
+};
+use tokio::task::JoinHandle;
+use tracing::{error, info};
+
+use super::elasticache::CredentialsProvider;
+
+enum Credentials {
+    Static(ConnectionInfo),
+    Dynamic(Arc<CredentialsProvider>, redis::ConnectionAddr),
+}
+
+impl Clone for Credentials {
+    fn clone(&self) -> Self {
+        match self {
+            Credentials::Static(info) => Credentials::Static(info.clone()),
+            Credentials::Dynamic(provider, addr) => {
+                Credentials::Dynamic(Arc::clone(provider), addr.clone())
+            }
+        }
+    }
+}
+
+/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
+/// Provides PubSub connection without credentials refresh.
+pub struct ConnectionWithCredentialsProvider {
+    credentials: Credentials,
+    con: Option<MultiplexedConnection>,
+    refresh_token_task: Option<JoinHandle<()>>,
+    mutex: tokio::sync::Mutex<()>,
+}
+
+impl Clone for ConnectionWithCredentialsProvider {
+    fn clone(&self) -> Self {
+        Self {
+            credentials: self.credentials.clone(),
+            con: None,
+            refresh_token_task: None,
+            mutex: tokio::sync::Mutex::new(()),
+        }
+    }
+}
+
+impl ConnectionWithCredentialsProvider {
+    pub fn new_with_credentials_provider(
+        host: String,
+        port: u16,
+        credentials_provider: Arc<CredentialsProvider>,
+    ) -> Self {
+        Self {
+            credentials: Credentials::Dynamic(
+                credentials_provider,
+                redis::ConnectionAddr::TcpTls {
+                    host,
+                    port,
+                    insecure: false,
+                    tls_params: None,
+                },
+            ),
+            con: None,
+            refresh_token_task: None,
+            mutex: tokio::sync::Mutex::new(()),
+        }
+    }
+
+    pub fn new_with_static_credentials<T: IntoConnectionInfo>(params: T) -> Self {
+        Self {
+            credentials: Credentials::Static(params.into_connection_info().unwrap()),
+            con: None,
+            refresh_token_task: None,
+            mutex: tokio::sync::Mutex::new(()),
+        }
+    }
+
+    pub async fn connect(&mut self) -> anyhow::Result<()> {
+        let _guard = self.mutex.lock().await;
+        if let Some(con) = self.con.as_mut() {
+            match redis::cmd("PING").query_async(con).await {
+                Ok(()) => {
+                    return Ok(());
+                }
+                Err(e) => {
+                    error!("Error during PING: {e:?}");
+                }
+            }
+        } else {
+            info!("Connection is not established");
+        }
+        info!("Establishing a new connection...");
+        self.con = None;
+        if let Some(f) = self.refresh_token_task.take() {
+            f.abort()
+        }
+        let con = self
+            .get_client()
+            .await?
+            .get_multiplexed_tokio_connection()
+            .await?;
+        if let Credentials::Dynamic(credentials_provider, _) = &self.credentials {
+            let credentials_provider = credentials_provider.clone();
+            let con2 = con.clone();
+            let f = tokio::spawn(async move {
+                let _ = Self::keep_connection(con2, credentials_provider).await;
+            });
+            self.refresh_token_task = Some(f);
+        }
+        self.con = Some(con);
+        Ok(())
+    }
+
+    async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
+        match &self.credentials {
+            Credentials::Static(info) => Ok(info.clone()),
+            Credentials::Dynamic(provider, addr) => {
+                let (username, password) = provider.provide_credentials().await?;
+                Ok(ConnectionInfo {
+                    addr: addr.clone(),
+                    redis: RedisConnectionInfo {
+                        db: 0,
+                        username: Some(username),
+                        password: Some(password.clone()),
+                    },
+                })
+            }
+        }
+    }
+
+    async fn get_client(&self) -> anyhow::Result<redis::Client> {
+        let client = redis::Client::open(self.get_connection_info().await?)?;
+        Ok(client)
+    }
+
+    // PubSub does not support credentials refresh.
+    // Requires manual reconnection every 12h.
+    pub async fn get_async_pubsub(&self) -> anyhow::Result<redis::aio::PubSub> {
+        Ok(self.get_client().await?.get_async_pubsub().await?)
+    }
+
+    // The connection lives for 12h.
+    // It can be prolonged with sending `AUTH` commands with the refreshed token.
+    // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits
+    async fn keep_connection(
+        mut con: MultiplexedConnection,
+        credentials_provider: Arc<CredentialsProvider>,
+    ) -> anyhow::Result<()> {
+        loop {
+            // The connection lives for 12h, for the sanity check we refresh it every hour.
+            tokio::time::sleep(Duration::from_secs(60 * 60)).await;
+            match Self::refresh_token(&mut con, credentials_provider.clone()).await {
+                Ok(()) => {
+                    info!("Token refreshed");
+                }
+                Err(e) => {
+                    error!("Error during token refresh: {e:?}");
+                }
+            }
+        }
+    }
+    async fn refresh_token(
+        con: &mut MultiplexedConnection,
+        credentials_provider: Arc<CredentialsProvider>,
+    ) -> anyhow::Result<()> {
+        let (user, password) = credentials_provider.provide_credentials().await?;
+        redis::cmd("AUTH")
+            .arg(user)
+            .arg(password)
+            .query_async(con)
+            .await?;
+        Ok(())
+    }
+    /// Sends an already encoded (packed) command into the TCP socket and
+    /// reads the single response from it.
+    pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult<redis::Value> {
+        // Clone connection to avoid having to lock the ArcSwap in write mode
+        let con = self.con.as_mut().ok_or(redis::RedisError::from((
+            redis::ErrorKind::IoError,
+            "Connection not established",
+        )))?;
+        con.send_packed_command(cmd).await
+    }
+
+    /// Sends multiple already encoded (packed) command into the TCP socket
+    /// and reads `count` responses from it.  This is used to implement
+    /// pipelining.
+    pub async fn send_packed_commands(
+        &mut self,
+        cmd: &redis::Pipeline,
+        offset: usize,
+        count: usize,
+    ) -> RedisResult<Vec<redis::Value>> {
+        // Clone shared connection future to avoid having to lock the ArcSwap in write mode
+        let con = self.con.as_mut().ok_or(redis::RedisError::from((
+            redis::ErrorKind::IoError,
+            "Connection not established",
+        )))?;
+        con.send_packed_commands(cmd, offset, count).await
+    }
+}
+
+impl ConnectionLike for ConnectionWithCredentialsProvider {
+    fn req_packed_command<'a>(
+        &'a mut self,
+        cmd: &'a redis::Cmd,
+    ) -> redis::RedisFuture<'a, redis::Value> {
+        (async move { self.send_packed_command(cmd).await }).boxed()
+    }
+
+    fn req_packed_commands<'a>(
+        &'a mut self,
+        cmd: &'a redis::Pipeline,
+        offset: usize,
+        count: usize,
+    ) -> redis::RedisFuture<'a, Vec<redis::Value>> {
+        (async move { self.send_packed_commands(cmd, offset, count).await }).boxed()
+    }
+
+    fn get_db(&self) -> i64 {
+        0
+    }
+}
--- a/proxy/src/redis/elasticache.rs
+++ b/proxy/src/redis/elasticache.rs
@@ -0,0 +1,110 @@
+use std::time::{Duration, SystemTime};
+
+use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_sdk_iam::config::ProvideCredentials;
+use aws_sigv4::http_request::{
+    self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
+};
+use tracing::info;
+
+#[derive(Debug)]
+pub struct AWSIRSAConfig {
+    region: String,
+    service_name: String,
+    cluster_name: String,
+    user_id: String,
+    token_ttl: Duration,
+    action: String,
+}
+
+impl AWSIRSAConfig {
+    pub fn new(region: String, cluster_name: Option<String>, user_id: Option<String>) -> Self {
+        AWSIRSAConfig {
+            region,
+            service_name: "elasticache".to_string(),
+            cluster_name: cluster_name.unwrap_or_default(),
+            user_id: user_id.unwrap_or_default(),
+            // "The IAM authentication token is valid for 15 minutes"
+            // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits
+            token_ttl: Duration::from_secs(15 * 60),
+            action: "connect".to_string(),
+        }
+    }
+}
+
+/// Credentials provider for AWS elasticache authentication.
+///
+/// Official documentation:
+/// <https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html>
+///
+/// Useful resources:
+/// <https://aws.amazon.com/blogs/database/simplify-managing-access-to-amazon-elasticache-for-redis-clusters-with-iam/>
+pub struct CredentialsProvider {
+    config: AWSIRSAConfig,
+    credentials_provider: CredentialsProviderChain,
+}
+
+impl CredentialsProvider {
+    pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self {
+        CredentialsProvider {
+            config,
+            credentials_provider,
+        }
+    }
+    pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+        let aws_credentials = self
+            .credentials_provider
+            .provide_credentials()
+            .await?
+            .into();
+        info!("AWS credentials successfully obtained");
+        info!("Connecting to Redis with configuration: {:?}", self.config);
+        let mut settings = SigningSettings::default();
+        settings.signature_location = SignatureLocation::QueryParams;
+        settings.expires_in = Some(self.config.token_ttl);
+        let signing_params = aws_sigv4::sign::v4::SigningParams::builder()
+            .identity(&aws_credentials)
+            .region(&self.config.region)
+            .name(&self.config.service_name)
+            .time(SystemTime::now())
+            .settings(settings)
+            .build()?
+            .into();
+        let auth_params = [
+            ("Action", &self.config.action),
+            ("User", &self.config.user_id),
+        ];
+        let auth_params = url::form_urlencoded::Serializer::new(String::new())
+            .extend_pairs(auth_params)
+            .finish();
+        let auth_uri = http::Uri::builder()
+            .scheme("http")
+            .authority(self.config.cluster_name.as_bytes())
+            .path_and_query(format!("/?{auth_params}"))
+            .build()?;
+        info!("{}", auth_uri);
+
+        // Convert the HTTP request into a signable request
+        let signable_request = SignableRequest::new(
+            "GET",
+            auth_uri.to_string(),
+            std::iter::empty(),
+            SignableBody::Bytes(&[]),
+        )?;
+
+        // Sign and then apply the signature to the request
+        let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts();
+        let mut signable_request = http::Request::builder()
+            .method("GET")
+            .uri(auth_uri)
+            .body(())?;
+        si.apply_to_request_http1x(&mut signable_request);
+        Ok((
+            self.config.user_id.clone(),
+            signable_request
+                .uri()
+                .to_string()
+                .replacen("http://", "", 1),
+        ))
+    }
+}
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -6,11 +6,12 @@ use redis::aio::PubSub;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;

+use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::{
    cache::project_info::ProjectInfoCache,
-    cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
+    cancellation::{CancelMap, CancellationHandler},
    intern::{ProjectIdInt, RoleNameInt},
-    metrics::REDIS_BROKEN_MESSAGES,
+    metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES},
 };

 const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -18,23 +19,13 @@ pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
 const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);

-struct RedisConsumerClient {
-    client: redis::Client,
-}
-
-impl RedisConsumerClient {
-    pub fn new(url: &str) -> anyhow::Result<Self> {
-        let client = redis::Client::open(url)?;
-        Ok(Self { client })
-    }
-    async fn try_connect(&self) -> anyhow::Result<PubSub> {
-        let mut conn = self.client.get_async_connection().await?.into_pubsub();
-        tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
-        conn.subscribe(CPLANE_CHANNEL_NAME).await?;
-        tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
-        conn.subscribe(PROXY_CHANNEL_NAME).await?;
-        Ok(conn)
-    }
+async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result<PubSub> {
+    let mut conn = client.get_async_pubsub().await?;
+    tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
+    conn.subscribe(CPLANE_CHANNEL_NAME).await?;
+    tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
+    conn.subscribe(PROXY_CHANNEL_NAME).await?;
+    Ok(conn)
 }

 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -80,21 +71,18 @@ where
    serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
 }

-struct MessageHandler<
-    C: ProjectInfoCache + Send + Sync + 'static,
-    H: NotificationsCancellationHandler + Send + Sync + 'static,
-> {
+struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
    cache: Arc<C>,
-    cancellation_handler: Arc<H>,
+    cancellation_handler: Arc<CancellationHandler<()>>,
    region_id: String,
 }

-impl<
-        C: ProjectInfoCache + Send + Sync + 'static,
-        H: NotificationsCancellationHandler + Send + Sync + 'static,
-    > MessageHandler<C, H>
-{
-    pub fn new(cache: Arc<C>, cancellation_handler: Arc<H>, region_id: String) -> Self {
+impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
+    pub fn new(
+        cache: Arc<C>,
+        cancellation_handler: Arc<CancellationHandler<()>>,
+        region_id: String,
+    ) -> Self {
        Self {
            cache,
            cancellation_handler,
@@ -139,7 +127,7 @@ impl<
                // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
                match self
                    .cancellation_handler
-                    .cancel_session_no_publish(cancel_session.cancel_key_data)
+                    .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil())
                    .await
                {
                    Ok(()) => {}
@@ -182,7 +170,7 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
 /// Handle console's invalidation messages.
 #[tracing::instrument(name = "console_notifications", skip_all)]
 pub async fn task_main<C>(
-    url: String,
+    redis: ConnectionWithCredentialsProvider,
    cache: Arc<C>,
    cancel_map: CancelMap,
    region_id: String,
@@ -193,13 +181,15 @@ where
    cache.enable_ttl();
    let handler = MessageHandler::new(
        cache,
-        Arc::new(CancellationHandler::new(cancel_map, None)),
+        Arc::new(CancellationHandler::<()>::new(
+            cancel_map,
+            NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
+        )),
        region_id,
    );

    loop {
-        let redis = RedisConsumerClient::new(&url)?;
-        let conn = match redis.try_connect().await {
+        let mut conn = match try_connect(&redis).await {
            Ok(conn) => {
                handler.disable_ttl();
                conn
@@ -212,7 +202,7 @@ where
                continue;
            }
        };
-        let mut stream = conn.into_on_message();
+        let mut stream = conn.on_message();
        while let Some(msg) = stream.next().await {
            match handler.handle_message(msg).await {
                Ok(()) => {}
--- a/proxy/src/redis/publisher.rs
+++ b/proxy/src/redis/publisher.rs
@@ -1,80 +0,0 @@
-use pq_proto::CancelKeyData;
-use redis::AsyncCommands;
-use uuid::Uuid;
-
-use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
-
-use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};
-
-pub struct RedisPublisherClient {
-    client: redis::Client,
-    publisher: Option<redis::aio::Connection>,
-    region_id: String,
-    limiter: RedisRateLimiter,
-}
-
-impl RedisPublisherClient {
-    pub fn new(
-        url: &str,
-        region_id: String,
-        info: &'static [RateBucketInfo],
-    ) -> anyhow::Result<Self> {
-        let client = redis::Client::open(url)?;
-        Ok(Self {
-            client,
-            publisher: None,
-            region_id,
-            limiter: RedisRateLimiter::new(info),
-        })
-    }
-    pub async fn try_publish(
-        &mut self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-    ) -> anyhow::Result<()> {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping cancellation message");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-        match self.publish(cancel_key_data, session_id).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to publish a message: {e}");
-                self.publisher = None;
-            }
-        }
-        tracing::info!("Publisher is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.publish(cancel_key_data, session_id).await
-    }
-
-    async fn publish(
-        &mut self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-    ) -> anyhow::Result<()> {
-        let conn = self
-            .publisher
-            .as_mut()
-            .ok_or_else(|| anyhow::anyhow!("not connected"))?;
-        let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
-            region_id: Some(self.region_id.clone()),
-            cancel_key_data,
-            session_id,
-        }))?;
-        conn.publish(PROXY_CHANNEL_NAME, payload).await?;
-        Ok(())
-    }
-    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
-        match self.client.get_async_connection().await {
-            Ok(conn) => {
-                self.publisher = Some(conn);
-            }
-            Err(e) => {
-                tracing::error!("failed to connect to redis: {e}");
-                return Err(e.into());
-            }
-        }
-        Ok(())
-    }
-}
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -33,6 +33,9 @@ pub enum Error {
    #[error("Internal error: missing digest")]
    MissingBinding,

+    #[error("could not decode salt: {0}")]
+    Base64(#[from] base64::DecodeError),
+
    #[error(transparent)]
    Io(#[from] io::Error),
 }
@@ -55,6 +58,7 @@ impl ReportableError for Error {
            Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
            Error::BadClientMessage(_) => crate::error::ErrorKind::User,
            Error::MissingBinding => crate::error::ErrorKind::Service,
+            Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
            Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
        }
    }
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -56,8 +56,6 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

 #[cfg(test)]
 mod tests {
-    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-
    use crate::sasl::{Mechanism, Step};

    use super::{Exchange, ServerSecret};
@@ -113,17 +111,11 @@ mod tests {
        );
    }

-    fn run_round_trip_test(server_password: &str, client_password: &str) {
-        let scram_secret = ServerSecret::build(server_password).unwrap();
-        let sasl_client =
-            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
-
-        let outcome = super::exchange(
-            &scram_secret,
-            sasl_client,
-            crate::config::TlsServerEndPoint::Undefined,
-        )
-        .unwrap();
+    async fn run_round_trip_test(server_password: &str, client_password: &str) {
+        let scram_secret = ServerSecret::build(server_password).await.unwrap();
+        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
+            .await
+            .unwrap();

        match outcome {
            crate::sasl::Outcome::Success(_) => {}
@@ -131,14 +123,14 @@ mod tests {
        }
    }

-    #[test]
-    fn round_trip() {
-        run_round_trip_test("pencil", "pencil")
+    #[tokio::test]
+    async fn round_trip() {
+        run_round_trip_test("pencil", "pencil").await
    }

-    #[test]
+    #[tokio::test]
    #[should_panic(expected = "password doesn't match")]
-    fn failure() {
-        run_round_trip_test("pencil", "eraser")
+    async fn failure() {
+        run_round_trip_test("pencil", "eraser").await
    }
 }
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -2,13 +2,16 @@

 use std::convert::Infallible;

-use postgres_protocol::authentication::sasl::ScramSha256;
+use hmac::{Hmac, Mac};
+use sha2::Sha256;
+use tokio::task::yield_now;

 use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
+use super::ScramKey;
 use crate::config;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};

@@ -71,40 +74,62 @@ impl<'a> Exchange<'a> {
    }
 }

-pub fn exchange(
+// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
+async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
+    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+    let mut prev = hmac
+        .clone()
+        .chain_update(salt)
+        .chain_update(1u32.to_be_bytes())
+        .finalize()
+        .into_bytes();
+
+    let mut hi = prev;
+
+    for i in 1..iterations {
+        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
+
+        for (hi, prev) in hi.iter_mut().zip(prev) {
+            *hi ^= prev;
+        }
+        // yield every ~250us
+        // hopefully reduces tail latencies
+        if i % 1024 == 0 {
+            yield_now().await
+        }
+    }
+
+    hi.into()
+}
+
+// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
+async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
+    let salted_password = pbkdf2(password, salt, iterations).await;
+
+    let make_key = |name| {
+        let key = Hmac::<Sha256>::new_from_slice(&salted_password)
+            .expect("HMAC is able to accept all key sizes")
+            .chain_update(name)
+            .finalize();
+
+        <[u8; 32]>::from(key.into_bytes())
+    };
+
+    make_key(b"Client Key").into()
+}
+
+pub async fn exchange(
    secret: &ServerSecret,
-    mut client: ScramSha256,
-    tls_server_end_point: config::TlsServerEndPoint,
+    password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
-    use sasl::Step::*;
+    let salt = base64::decode(&secret.salt_base64)?;
+    let client_key = derive_client_key(password, &salt, secret.iterations).await;

-    let init = SaslInitial {
-        nonce: rand::random,
-    };
-
-    let client_first = std::str::from_utf8(client.message())
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
-        Continue(sent, server_first) => {
-            client.update(server_first.as_bytes())?;
-            sent
-        }
-        Success(x, _) => match x {},
-        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
-    };
-
-    let client_final = std::str::from_utf8(client.message())
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
-        Success(keys, server_final) => {
-            client.finish(server_final.as_bytes())?;
-            keys
-        }
-        Continue(x, _) => match x {},
-        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
-    };
-
-    Ok(sasl::Outcome::Success(keys))
+    if secret.is_password_invalid(&client_key).into() {
+        Ok(sasl::Outcome::Failure("password doesn't match"))
+    } else {
+        Ok(sasl::Outcome::Success(client_key))
+    }
 }

 impl SaslInitial {
@@ -185,7 +210,7 @@ impl SaslSentInner {
            .derive_client_key(&client_final_message.proof);

        // Auth fails either if keys don't match or it's pre-determined to fail.
-        if client_key.sha256() != secret.stored_key || secret.doomed {
+        if secret.is_password_invalid(&client_key).into() {
            return Ok(sasl::Step::Failure("password doesn't match"));
        }

--- a/Show More
+++ b/Show More