diff --git a/CODEOWNERS b/CODEOWNERS
index 5b601f0566..9a23e8c958 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,12 +1,13 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
+/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
diff --git a/Cargo.lock b/Cargo.lock
index 625958fdb8..9930a6c323 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -277,6 +277,7 @@ dependencies = [
  "anyhow",
  "aws-config",
  "aws-sdk-secretsmanager",
+ "bytes",
  "camino",
  "clap",
  "control_plane",
@@ -288,6 +289,8 @@ dependencies = [
  "hex",
  "humantime",
  "hyper",
+ "lasso",
+ "measured",
  "metrics",
  "once_cell",
  "pageserver_api",
@@ -295,6 +298,7 @@ dependencies = [
  "postgres_connection",
  "r2d2",
  "reqwest",
+ "routerify",
  "serde",
  "serde_json",
  "thiserror",
@@ -2880,6 +2884,35 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
+[[package]]
+name = "measured"
+version = "0.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
+dependencies = [
+ "bytes",
+ "hashbrown 0.14.0",
+ "itoa",
+ "lasso",
+ "measured-derive",
+ "memchr",
+ "parking_lot 0.12.1",
+ "rustc-hash",
+ "ryu",
+]
+
+[[package]]
+name = "measured-derive"
+version = "0.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.52",
+]
+
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -3901,7 +3934,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -3914,7 +3947,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "native-tls",
  "tokio",
@@ -3925,7 +3958,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -3938,12 +3971,13 @@ dependencies = [
  "rand 0.8.5",
  "sha2",
  "stringprep",
+ "tokio",
 ]
 
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4225,6 +4259,7 @@ dependencies = [
  "smallvec",
  "smol_str",
  "socket2 0.5.5",
+ "subtle",
  "sync_wrapper",
  "task-local-extensions",
  "thiserror",
@@ -5945,7 +5980,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index 3f98c1946c..65308db3c1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -101,6 +101,7 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
+measured = { version = "0.0.13", features=["default", "lasso"] }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -148,6 +149,7 @@ smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
+"subtle"  = "2.5.0"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
diff --git a/compute_tools/README.md b/compute_tools/README.md
index 22a7de7cb7..8d84031efc 100644
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \
             -b /usr/local/bin/postgres
 ```
 
+## State Diagram
+
+Computes can be in various states. Below is a diagram that details how a
+compute moves between states.
+
+```mermaid
+%% https://mermaid.js.org/syntax/stateDiagram.html
+stateDiagram-v2
+  [*] --> Empty : Compute spawned
+  Empty --> ConfigurationPending : Waiting for compute spec
+  ConfigurationPending --> Configuration : Received compute spec
+  Configuration --> Failed : Failed to configure the compute
+  Configuration --> Running : Compute has been configured
+  Empty --> Init : Compute spec is immediately available
+  Empty --> TerminationPending : Requested termination
+  Init --> Failed : Failed to start Postgres
+  Init --> Running : Started Postgres
+  Running --> TerminationPending : Requested termination
+  TerminationPending --> Terminated : Terminated compute
+  Failed --> [*] : Compute exited
+  Terminated --> [*] : Compute exited
+```
+
 ## Tests
 
 Cargo formatter:
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index f78f56c480..34882659e3 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -17,6 +17,7 @@ testing = []
 anyhow.workspace = true
 aws-config.workspace = true
 aws-sdk-secretsmanager.workspace = true
+bytes.workspace = true
 camino.workspace = true
 clap.workspace = true
 fail.workspace = true
@@ -25,17 +26,20 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
+lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
 reqwest.workspace = true
+routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
+measured.workspace = true
 
 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 076b3a2f70..036019cd38 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,11 @@
+use crate::metrics::{
+    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
+    METRICS_REGISTRY,
+};
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
+use futures::Future;
+use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
@@ -34,6 +40,8 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
 
 use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
 
+use routerify::Middleware;
+
 /// State available to HTTP request handlers
 #[derive(Clone)]
 pub struct HttpState {
@@ -313,7 +321,7 @@ async fn handle_tenant_timeline_passthrough(
     tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
 
     // Find the node that holds shard zero
-    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
+    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
 
     // Callers will always pass an unsharded tenant ID.  Before proxying, we must
     // rewrite this to a shard-aware shard zero ID.
@@ -322,12 +330,39 @@ async fn handle_tenant_timeline_passthrough(
     let tenant_shard_str = format!("{}", tenant_shard_id);
     let path = path.replace(&tenant_str, &tenant_shard_str);
 
-    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
+    let latency = &METRICS_REGISTRY
+        .metrics_group
+        .storage_controller_passthrough_request_latency;
+
+    // This is a bit awkward. We remove the param from the request
+    // and join the words by '_' to get a label for the request.
+    let just_path = path.replace(&tenant_shard_str, "");
+    let path_label = just_path
+        .split('/')
+        .filter(|token| !token.is_empty())
+        .collect::<Vec<_>>()
+        .join("_");
+    let labels = PageserverRequestLabelGroup {
+        pageserver_id: &node.get_id().to_string(),
+        path: &path_label,
+        method: crate::metrics::Method::Get,
+    };
+
+    let _timer = latency.start_timer(labels.clone());
+
+    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
     let resp = client.get_raw(path).await.map_err(|_e|
         // FIXME: give APiError a proper Unavailable variant.  We return 503 here because
         // if we can't successfully send a request to the pageserver, we aren't available.
         ApiError::ShuttingDown)?;
 
+    if !resp.status().is_success() {
+        let error_counter = &METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_passthrough_request_error;
+        error_counter.inc(labels);
+    }
+
     // We have a reqest::Response, would like a http::Response
     let mut builder = hyper::Response::builder()
         .status(resp.status())
@@ -498,7 +533,11 @@ impl From<ReconcileError> for ApiError {
 
 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+async fn tenant_service_handler<R, H>(
+    request: Request<Body>,
+    handler: H,
+    request_name: RequestName,
+) -> R::Output
 where
     R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
     H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -518,9 +557,10 @@ where
         ));
     }
 
-    request_span(
+    named_request_span(
         request,
         |request| async move { handler(service, request).await },
+        request_name,
     )
     .await
 }
@@ -531,11 +571,98 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
     })
 }
 
+#[derive(Clone, Debug)]
+struct RequestMeta {
+    method: hyper::http::Method,
+    at: Instant,
+}
+
+fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
+) -> Middleware<B, ApiError> {
+    Middleware::pre(move |req| async move {
+        let meta = RequestMeta {
+            method: req.method().clone(),
+            at: Instant::now(),
+        };
+
+        req.set_context(meta);
+
+        Ok(req)
+    })
+}
+
+fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
+) -> Middleware<B, ApiError> {
+    Middleware::post_with_info(move |resp, req_info| async move {
+        let request_name = match req_info.context::<RequestName>() {
+            Some(name) => name,
+            None => {
+                return Ok(resp);
+            }
+        };
+
+        if let Some(meta) = req_info.context::<RequestMeta>() {
+            let status = &crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_http_request_status;
+            let latency = &crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_http_request_latency;
+
+            status.inc(HttpRequestStatusLabelGroup {
+                path: request_name.0,
+                method: meta.method.clone().into(),
+                status: crate::metrics::StatusCode(resp.status()),
+            });
+
+            latency.observe(
+                HttpRequestLatencyLabelGroup {
+                    path: request_name.0,
+                    method: meta.method.into(),
+                },
+                meta.at.elapsed().as_secs_f64(),
+            );
+        }
+        Ok(resp)
+    })
+}
+
+pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
+
+    let payload = crate::metrics::METRICS_REGISTRY.encode();
+    let response = Response::builder()
+        .status(200)
+        .header(CONTENT_TYPE, TEXT_FORMAT)
+        .body(payload.into())
+        .unwrap();
+
+    Ok(response)
+}
+
+#[derive(Clone)]
+struct RequestName(&'static str);
+
+async fn named_request_span<R, H>(
+    request: Request<Body>,
+    handler: H,
+    name: RequestName,
+) -> R::Output
+where
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
+{
+    request.set_context(name);
+    request_span(request, handler).await
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router();
+    let mut router = endpoint::make_router()
+        .middleware(prologue_metrics_middleware())
+        .middleware(epilogue_metrics_middleware());
     if auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
             let state = get_state(request);
@@ -544,99 +671,166 @@ pub fn make_router(
             } else {
                 state.auth.as_deref()
             }
-        }))
+        }));
     }
 
     router
         .data(Arc::new(HttpState::new(service, auth)))
+        .get("/metrics", |r| {
+            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
+        })
         // Non-prefixed generic endpoints (status, metrics)
-        .get("/status", |r| request_span(r, handle_status))
-        .get("/ready", |r| request_span(r, handle_ready))
+        .get("/status", |r| {
+            named_request_span(r, handle_status, RequestName("status"))
+        })
+        .get("/ready", |r| {
+            named_request_span(r, handle_ready, RequestName("ready"))
+        })
         // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
         .post("/upcall/v1/re-attach", |r| {
-            request_span(r, handle_re_attach)
+            named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
+        })
+        .post("/upcall/v1/validate", |r| {
+            named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
         })
-        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
         // Test/dev/debug endpoints
         .post("/debug/v1/attach-hook", |r| {
-            request_span(r, handle_attach_hook)
+            named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
+        })
+        .post("/debug/v1/inspect", |r| {
+            named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
         })
-        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
         .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            request_span(r, handle_tenant_drop)
+            named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
         })
         .post("/debug/v1/node/:node_id/drop", |r| {
-            request_span(r, handle_node_drop)
+            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
+        })
+        .get("/debug/v1/tenant", |r| {
+            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
         })
-        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
         .get("/debug/v1/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(r, handle_tenant_locate)
+            tenant_service_handler(
+                r,
+                handle_tenant_locate,
+                RequestName("debug_v1_tenant_locate"),
+            )
         })
         .get("/debug/v1/scheduler", |r| {
-            request_span(r, handle_scheduler_dump)
+            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
         })
         .post("/debug/v1/consistency_check", |r| {
-            request_span(r, handle_consistency_check)
+            named_request_span(
+                r,
+                handle_consistency_check,
+                RequestName("debug_v1_consistency_check"),
+            )
         })
         .put("/debug/v1/failpoints", |r| {
             request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
         })
         // Node operations
         .post("/control/v1/node", |r| {
-            request_span(r, handle_node_register)
+            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
+        })
+        .get("/control/v1/node", |r| {
+            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
         })
-        .get("/control/v1/node", |r| request_span(r, handle_node_list))
         .put("/control/v1/node/:node_id/config", |r| {
-            request_span(r, handle_node_configure)
+            named_request_span(
+                r,
+                handle_node_configure,
+                RequestName("control_v1_node_config"),
+            )
         })
         // Tenant Shard operations
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
+            tenant_service_handler(
+                r,
+                handle_tenant_shard_migrate,
+                RequestName("control_v1_tenant_migrate"),
+            )
         })
         .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(r, handle_tenant_shard_split)
+            tenant_service_handler(
+                r,
+                handle_tenant_shard_split,
+                RequestName("control_v1_tenant_shard_split"),
+            )
         })
         .get("/control/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_describe)
+            tenant_service_handler(
+                r,
+                handle_tenant_describe,
+                RequestName("control_v1_tenant_describe"),
+            )
         })
         // Tenant operations
         // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
         // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
         .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create)
+            tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
         })
         .delete("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_delete)
+            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
         })
         .put("/v1/tenant/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_set)
+            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
         })
         .get("/v1/tenant/:tenant_id/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_get)
+            tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
         })
         .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
-            tenant_service_handler(r, handle_tenant_location_config)
+            tenant_service_handler(
+                r,
+                handle_tenant_location_config,
+                RequestName("v1_tenant_location_config"),
+            )
         })
         .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
-            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
+            tenant_service_handler(
+                r,
+                handle_tenant_time_travel_remote_storage,
+                RequestName("v1_tenant_time_travel_remote_storage"),
+            )
         })
         .post("/v1/tenant/:tenant_id/secondary/download", |r| {
-            tenant_service_handler(r, handle_tenant_secondary_download)
+            tenant_service_handler(
+                r,
+                handle_tenant_secondary_download,
+                RequestName("v1_tenant_secondary_download"),
+            )
         })
         // Timeline operations
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_delete)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_delete,
+                RequestName("v1_tenant_timeline"),
+            )
         })
         .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_create)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_create,
+                RequestName("v1_tenant_timeline"),
+            )
         })
         // Tenant detail GET passthrough to shard zero
         .get("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_passthrough)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_passthrough,
+                RequestName("v1_tenant_passthrough"),
+            )
         })
         // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
         // timeline GET APIs will be implicitly included.
         .get("/v1/tenant/:tenant_id/timeline*", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_passthrough)
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_passthrough,
+                RequestName("v1_tenant_timeline_passthrough"),
+            )
         })
 }
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index 4aff29f15b..8bcd5c0ac4 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -8,6 +8,7 @@ pub mod http;
 mod id_lock_map;
 pub mod metrics;
 mod node;
+mod pageserver_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs
index ffe093b9c8..ccf5e9b07c 100644
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -1,32 +1,284 @@
-use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+//!
+//! This module provides metric definitions for the storage controller.
+//!
+//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
+//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
+//! constant.
+//!
+//! The rest of the code defines label group types and deals with converting outer types to labels.
+//!
+use bytes::Bytes;
+use measured::{
+    label::{LabelValue, StaticLabelSet},
+    FixedCardinalityLabel, MetricGroup,
+};
 use once_cell::sync::Lazy;
+use std::sync::Mutex;
 
-pub(crate) struct ReconcilerMetrics {
-    pub(crate) spawned: IntCounter,
-    pub(crate) complete: IntCounterVec,
-}
+use crate::persistence::{DatabaseError, DatabaseOperation};
 
-impl ReconcilerMetrics {
-    // Labels used on [`Self::complete`]
-    pub(crate) const SUCCESS: &'static str = "ok";
-    pub(crate) const ERROR: &'static str = "success";
-    pub(crate) const CANCEL: &'static str = "cancel";
-}
-
-pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
-    spawned: register_int_counter!(
-        "storage_controller_reconcile_spawn",
-        "Count of how many times we spawn a reconcile task",
-    )
-    .expect("failed to define a metric"),
-    complete: register_int_counter_vec!(
-        "storage_controller_reconcile_complete",
-        "Reconciler tasks completed, broken down by success/failure/cancelled",
-        &["status"],
-    )
-    .expect("failed to define a metric"),
-});
+pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
+    Lazy::new(StorageControllerMetrics::default);
 
 pub fn preinitialize_metrics() {
-    Lazy::force(&RECONCILER);
+    Lazy::force(&METRICS_REGISTRY);
+}
+
+pub(crate) struct StorageControllerMetrics {
+    pub(crate) metrics_group: StorageControllerMetricGroup,
+    encoder: Mutex<measured::text::TextEncoder>,
+}
+
+#[derive(measured::MetricGroup)]
+pub(crate) struct StorageControllerMetricGroup {
+    /// Count of how many times we spawn a reconcile task
+    pub(crate) storage_controller_reconcile_spawn: measured::Counter,
+    /// Reconciler tasks completed, broken down by success/failure/cancelled
+    pub(crate) storage_controller_reconcile_complete:
+        measured::CounterVec<ReconcileCompleteLabelGroupSet>,
+
+    /// HTTP request status counters for handled requests
+    pub(crate) storage_controller_http_request_status:
+        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
+    /// HTTP request handler latency across all status codes
+    pub(crate) storage_controller_http_request_latency:
+        measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
+
+    /// Count of HTTP requests to the pageserver that resulted in an error,
+    /// broken down by the pageserver node id, request name and method
+    pub(crate) storage_controller_pageserver_request_error:
+        measured::CounterVec<PageserverRequestLabelGroupSet>,
+
+    /// Latency of HTTP requests to the pageserver, broken down by pageserver
+    /// node id, request name and method. This include both successful and unsuccessful
+    /// requests.
+    pub(crate) storage_controller_pageserver_request_latency:
+        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
+
+    /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
+    /// broken down by the pageserver node id, request name and method
+    pub(crate) storage_controller_passthrough_request_error:
+        measured::CounterVec<PageserverRequestLabelGroupSet>,
+
+    /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
+    /// node id, request name and method. This include both successful and unsuccessful
+    /// requests.
+    pub(crate) storage_controller_passthrough_request_latency:
+        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
+
+    /// Count of errors in database queries, broken down by error type and operation.
+    pub(crate) storage_controller_database_query_error:
+        measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
+
+    /// Latency of database queries, broken down by operation.
+    pub(crate) storage_controller_database_query_latency:
+        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
+}
+
+impl StorageControllerMetrics {
+    pub(crate) fn encode(&self) -> Bytes {
+        let mut encoder = self.encoder.lock().unwrap();
+        self.metrics_group.collect_into(&mut *encoder);
+        encoder.finish()
+    }
+}
+
+impl Default for StorageControllerMetrics {
+    fn default() -> Self {
+        Self {
+            metrics_group: StorageControllerMetricGroup::new(),
+            encoder: Mutex::new(measured::text::TextEncoder::new()),
+        }
+    }
+}
+
+impl StorageControllerMetricGroup {
+    pub(crate) fn new() -> Self {
+        Self {
+            storage_controller_reconcile_spawn: measured::Counter::new(),
+            storage_controller_reconcile_complete: measured::CounterVec::new(
+                ReconcileCompleteLabelGroupSet {
+                    status: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_http_request_status: measured::CounterVec::new(
+                HttpRequestStatusLabelGroupSet {
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                    status: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_http_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_pageserver_request_error: measured::CounterVec::new(
+                PageserverRequestLabelGroupSet {
+                    pageserver_id: lasso::ThreadedRodeo::new(),
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_pageserver_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_passthrough_request_error: measured::CounterVec::new(
+                PageserverRequestLabelGroupSet {
+                    pageserver_id: lasso::ThreadedRodeo::new(),
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_passthrough_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_database_query_error: measured::CounterVec::new(
+                DatabaseQueryErrorLabelGroupSet {
+                    operation: StaticLabelSet::new(),
+                    error_type: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_database_query_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+        }
+    }
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = ReconcileCompleteLabelGroupSet)]
+pub(crate) struct ReconcileCompleteLabelGroup {
+    pub(crate) status: ReconcileOutcome,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = HttpRequestStatusLabelGroupSet)]
+pub(crate) struct HttpRequestStatusLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+    pub(crate) status: StatusCode,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = HttpRequestLatencyLabelGroupSet)]
+pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+}
+
+impl Default for HttpRequestLatencyLabelGroupSet {
+    fn default() -> Self {
+        Self {
+            path: lasso::ThreadedRodeo::new(),
+            method: StaticLabelSet::new(),
+        }
+    }
+}
+
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = PageserverRequestLabelGroupSet)]
+pub(crate) struct PageserverRequestLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) pageserver_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
+    pub(crate) path: &'a str,
+    pub(crate) method: Method,
+}
+
+impl Default for PageserverRequestLabelGroupSet {
+    fn default() -> Self {
+        Self {
+            pageserver_id: lasso::ThreadedRodeo::new(),
+            path: lasso::ThreadedRodeo::new(),
+            method: StaticLabelSet::new(),
+        }
+    }
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = DatabaseQueryErrorLabelGroupSet)]
+pub(crate) struct DatabaseQueryErrorLabelGroup {
+    pub(crate) error_type: DatabaseErrorLabel,
+    pub(crate) operation: DatabaseOperation,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = DatabaseQueryLatencyLabelGroupSet)]
+pub(crate) struct DatabaseQueryLatencyLabelGroup {
+    pub(crate) operation: DatabaseOperation,
+}
+
+#[derive(FixedCardinalityLabel)]
+pub(crate) enum ReconcileOutcome {
+    #[label(rename = "ok")]
+    Success,
+    Error,
+    Cancel,
+}
+
+#[derive(FixedCardinalityLabel, Clone)]
+pub(crate) enum Method {
+    Get,
+    Put,
+    Post,
+    Delete,
+    Other,
+}
+
+impl From<hyper::Method> for Method {
+    fn from(value: hyper::Method) -> Self {
+        if value == hyper::Method::GET {
+            Method::Get
+        } else if value == hyper::Method::PUT {
+            Method::Put
+        } else if value == hyper::Method::POST {
+            Method::Post
+        } else if value == hyper::Method::DELETE {
+            Method::Delete
+        } else {
+            Method::Other
+        }
+    }
+}
+
+pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
+
+impl LabelValue for StatusCode {
+    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
+        v.write_int(self.0.as_u16() as u64)
+    }
+}
+
+impl FixedCardinalityLabel for StatusCode {
+    fn cardinality() -> usize {
+        (100..1000).len()
+    }
+
+    fn encode(&self) -> usize {
+        self.0.as_u16() as usize
+    }
+
+    fn decode(value: usize) -> Self {
+        Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
+    }
+}
+
+#[derive(FixedCardinalityLabel)]
+pub(crate) enum DatabaseErrorLabel {
+    Query,
+    Connection,
+    ConnectionPool,
+    Logical,
+}
+
+impl DatabaseError {
+    pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
+        match self {
+            Self::Query(_) => DatabaseErrorLabel::Query,
+            Self::Connection(_) => DatabaseErrorLabel::Connection,
+            Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
+            Self::Logical(_) => DatabaseErrorLabel::Logical,
+        }
+    }
 }
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index 4167782715..df40bff66f 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -12,7 +12,9 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};
 
-use crate::{persistence::NodePersistence, scheduler::MaySchedule};
+use crate::{
+    pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
+};
 
 /// Represents the in-memory description of a Node.
 ///
@@ -202,7 +204,7 @@ impl Node {
         cancel: &CancellationToken,
     ) -> Option<mgmt_api::Result<T>>
     where
-        O: FnMut(mgmt_api::Client) -> F,
+        O: FnMut(PageserverClient) -> F,
         F: std::future::Future<Output = mgmt_api::Result<T>>,
     {
         fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -224,8 +226,12 @@ impl Node {
                     .build()
                     .expect("Failed to construct HTTP client");
 
-                let client =
-                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
+                let client = PageserverClient::from_client(
+                    self.get_id(),
+                    http_client,
+                    self.base_url(),
+                    jwt.as_deref(),
+                );
 
                 let node_cancel_fut = self.cancel.cancelled();
 
diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/control_plane/attachment_service/src/pageserver_client.rs
new file mode 100644
index 0000000000..8237229d7b
--- /dev/null
+++ b/control_plane/attachment_service/src/pageserver_client.rs
@@ -0,0 +1,203 @@
+use pageserver_api::{
+    models::{
+        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+    },
+    shard::TenantShardId,
+};
+use pageserver_client::mgmt_api::{Client, Result};
+use reqwest::StatusCode;
+use utils::id::{NodeId, TimelineId};
+
+/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
+/// controller to collect metrics in a non-intrusive manner.
+#[derive(Debug, Clone)]
+pub(crate) struct PageserverClient {
+    inner: Client,
+    node_id_label: String,
+}
+
+macro_rules! measured_request {
+    ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
+        let labels = crate::metrics::PageserverRequestLabelGroup {
+            pageserver_id: $node_id,
+            path: $name,
+            method: $method,
+        };
+
+        let latency = &crate::metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pageserver_request_latency;
+        let _timer_guard = latency.start_timer(labels.clone());
+
+        let res = $invoke;
+
+        if res.is_err() {
+            let error_counters = &crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_request_error;
+            error_counters.inc(labels)
+        }
+
+        res
+    }};
+}
+
+impl PageserverClient {
+    pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self {
+            inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
+            node_id_label: node_id.0.to_string(),
+        }
+    }
+
+    pub(crate) fn from_client(
+        node_id: NodeId,
+        raw_client: reqwest::Client,
+        mgmt_api_endpoint: String,
+        jwt: Option<&str>,
+    ) -> Self {
+        Self {
+            inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
+            node_id_label: node_id.0.to_string(),
+        }
+    }
+
+    pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
+        measured_request!(
+            "tenant",
+            crate::metrics::Method::Delete,
+            &self.node_id_label,
+            self.inner.tenant_delete(tenant_shard_id).await
+        )
+    }
+
+    pub(crate) async fn tenant_time_travel_remote_storage(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timestamp: &str,
+        done_if_after: &str,
+    ) -> Result<()> {
+        measured_request!(
+            "tenant_time_travel_remote_storage",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
+                .await
+        )
+    }
+
+    pub(crate) async fn tenant_secondary_download(
+        &self,
+        tenant_id: TenantShardId,
+        wait: Option<std::time::Duration>,
+    ) -> Result<(StatusCode, SecondaryProgress)> {
+        measured_request!(
+            "tenant_secondary_download",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.tenant_secondary_download(tenant_id, wait).await
+        )
+    }
+
+    pub(crate) async fn location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+        config: LocationConfig,
+        flush_ms: Option<std::time::Duration>,
+        lazy: bool,
+    ) -> Result<()> {
+        measured_request!(
+            "location_config",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .location_config(tenant_shard_id, config, flush_ms, lazy)
+                .await
+        )
+    }
+
+    pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
+        measured_request!(
+            "location_configs",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.list_location_config().await
+        )
+    }
+
+    pub(crate) async fn get_location_config(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Option<LocationConfig>> {
+        measured_request!(
+            "location_config",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.get_location_config(tenant_shard_id).await
+        )
+    }
+
+    pub(crate) async fn timeline_create(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: &TimelineCreateRequest,
+    ) -> Result<TimelineInfo> {
+        measured_request!(
+            "timeline",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.timeline_create(tenant_shard_id, req).await
+        )
+    }
+
+    pub(crate) async fn timeline_delete(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<StatusCode> {
+        measured_request!(
+            "timeline",
+            crate::metrics::Method::Delete,
+            &self.node_id_label,
+            self.inner
+                .timeline_delete(tenant_shard_id, timeline_id)
+                .await
+        )
+    }
+
+    pub(crate) async fn tenant_shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        req: TenantShardSplitRequest,
+    ) -> Result<TenantShardSplitResponse> {
+        measured_request!(
+            "tenant_shard_split",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner.tenant_shard_split(tenant_shard_id, req).await
+        )
+    }
+
+    pub(crate) async fn timeline_list(
+        &self,
+        tenant_shard_id: &TenantShardId,
+    ) -> Result<Vec<TimelineInfo>> {
+        measured_request!(
+            "timelines",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.timeline_list(tenant_shard_id).await
+        )
+    }
+
+    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
+        measured_request!(
+            "utilization",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.get_utilization().await
+        )
+    }
+}
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 209d8ff075..dafd52017b 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -19,6 +19,9 @@ use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};
 
+use crate::metrics::{
+    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
+};
 use crate::node::Node;
 
 /// ## What do we store?
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError {
     Logical(String),
 }
 
+#[derive(measured::FixedCardinalityLabel, Clone)]
+pub(crate) enum DatabaseOperation {
+    InsertNode,
+    UpdateNode,
+    DeleteNode,
+    ListNodes,
+    BeginShardSplit,
+    CompleteShardSplit,
+    AbortShardSplit,
+    Detach,
+    ReAttach,
+    IncrementGeneration,
+    ListTenantShards,
+    InsertTenantShards,
+    UpdateTenantShard,
+    DeleteTenant,
+    UpdateTenantConfig,
+}
+
 #[must_use]
 pub(crate) enum AbortShardSplitStatus {
     /// We aborted the split in the database by reverting to the parent shards
@@ -115,6 +137,34 @@ impl Persistence {
         }
     }
 
+    /// Wraps `with_conn` in order to collect latency and error metrics
+    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
+    where
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        R: Send + 'static,
+    {
+        let latency = &METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_database_query_latency;
+        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
+            operation: op.clone(),
+        });
+
+        let res = self.with_conn(func).await;
+
+        if let Err(err) = &res {
+            let error_counter = &METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_database_query_error;
+            error_counter.inc(DatabaseQueryErrorLabelGroup {
+                error_type: err.error_label(),
+                operation: op,
+            })
+        }
+
+        res
+    }
+
     /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
     async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
     where
@@ -130,21 +180,27 @@ impl Persistence {
     /// When a node is first registered, persist it before using it for anything
     pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
         let np = node.to_persistent();
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::insert_into(crate::schema::nodes::table)
-                .values(&np)
-                .execute(conn)?;
-            Ok(())
-        })
+        self.with_measured_conn(
+            DatabaseOperation::InsertNode,
+            move |conn| -> DatabaseResult<()> {
+                diesel::insert_into(crate::schema::nodes::table)
+                    .values(&np)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
         .await
     }
 
     /// At startup, populate the list of nodes which our shards may be placed on
     pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
         let nodes: Vec<NodePersistence> = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
-            })
+            .with_measured_conn(
+                DatabaseOperation::ListNodes,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
+                },
+            )
             .await?;
 
         tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -159,7 +215,7 @@ impl Persistence {
     ) -> DatabaseResult<()> {
         use crate::schema::nodes::dsl::*;
         let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
                 let updated = diesel::update(nodes)
                     .filter(node_id.eq(input_node_id.0 as i64))
                     .set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -181,9 +237,12 @@ impl Persistence {
     /// be enriched at runtime with state discovered on pageservers.
     pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
         let loaded = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
-            })
+            .with_measured_conn(
+                DatabaseOperation::ListTenantShards,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+                },
+            )
             .await?;
 
         if loaded.is_empty() {
@@ -260,17 +319,20 @@ impl Persistence {
         shards: Vec<TenantShardPersistence>,
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                for tenant in &shards {
-                    diesel::insert_into(tenant_shards)
-                        .values(tenant)
-                        .execute(conn)?;
-                }
+        self.with_measured_conn(
+            DatabaseOperation::InsertTenantShards,
+            move |conn| -> DatabaseResult<()> {
+                conn.transaction(|conn| -> QueryResult<()> {
+                    for tenant in &shards {
+                        diesel::insert_into(tenant_shards)
+                            .values(tenant)
+                            .execute(conn)?;
+                    }
+                    Ok(())
+                })?;
                 Ok(())
-            })?;
-            Ok(())
-        })
+            },
+        )
         .await
     }
 
@@ -278,25 +340,31 @@ impl Persistence {
     /// the tenant from memory on this server.
     pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::delete(tenant_shards)
-                .filter(tenant_id.eq(del_tenant_id.to_string()))
-                .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::DeleteTenant,
+            move |conn| -> DatabaseResult<()> {
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(del_tenant_id.to_string()))
+                    .execute(conn)?;
 
-            Ok(())
-        })
+                Ok(())
+            },
+        )
         .await
     }
 
     pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
         use crate::schema::nodes::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::delete(nodes)
-                .filter(node_id.eq(del_node_id.0 as i64))
-                .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::DeleteNode,
+            move |conn| -> DatabaseResult<()> {
+                diesel::delete(nodes)
+                    .filter(node_id.eq(del_node_id.0 as i64))
+                    .execute(conn)?;
 
-            Ok(())
-        })
+                Ok(())
+            },
+        )
         .await
     }
 
@@ -310,7 +378,7 @@ impl Persistence {
     ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
         use crate::schema::tenant_shards::dsl::*;
         let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
                 let rows_updated = diesel::update(tenant_shards)
                     .filter(generation_pageserver.eq(node_id.0 as i64))
                     .set(generation.eq(generation + 1))
@@ -360,7 +428,7 @@ impl Persistence {
     ) -> anyhow::Result<Generation> {
         use crate::schema::tenant_shards::dsl::*;
         let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
                 let updated = diesel::update(tenant_shards)
                     .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                     .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -404,7 +472,7 @@ impl Persistence {
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
 
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
             let query = diesel::update(tenant_shards)
                 .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                 .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -445,7 +513,7 @@ impl Persistence {
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
 
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
             diesel::update(tenant_shards)
                 .filter(tenant_id.eq(input_tenant_id.to_string()))
                 .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
@@ -460,7 +528,7 @@ impl Persistence {
 
     pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
         use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
             let updated = diesel::update(tenant_shards)
                 .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                 .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -490,7 +558,7 @@ impl Persistence {
         parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
             conn.transaction(|conn| -> DatabaseResult<()> {
                 // Mark parent shards as splitting
 
@@ -554,26 +622,29 @@ impl Persistence {
         old_shard_count: ShardCount,
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                // Drop parent shards
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.literal() as i32))
-                    .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::CompleteShardSplit,
+            move |conn| -> DatabaseResult<()> {
+                conn.transaction(|conn| -> QueryResult<()> {
+                    // Drop parent shards
+                    diesel::delete(tenant_shards)
+                        .filter(tenant_id.eq(split_tenant_id.to_string()))
+                        .filter(shard_count.eq(old_shard_count.literal() as i32))
+                        .execute(conn)?;
 
-                // Clear sharding flag
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .set((splitting.eq(0),))
-                    .execute(conn)?;
-                debug_assert!(updated > 0);
+                    // Clear sharding flag
+                    let updated = diesel::update(tenant_shards)
+                        .filter(tenant_id.eq(split_tenant_id.to_string()))
+                        .set((splitting.eq(0),))
+                        .execute(conn)?;
+                    debug_assert!(updated > 0);
+
+                    Ok(())
+                })?;
 
                 Ok(())
-            })?;
-
-            Ok(())
-        })
+            },
+        )
         .await
     }
 
@@ -585,40 +656,44 @@ impl Persistence {
         new_shard_count: ShardCount,
     ) -> DatabaseResult<AbortShardSplitStatus> {
         use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-            let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                // Clear the splitting state on parent shards
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.ne(new_shard_count.literal() as i32))
-                    .set((splitting.eq(0),))
-                    .execute(conn)?;
+        self.with_measured_conn(
+            DatabaseOperation::AbortShardSplit,
+            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+                let aborted =
+                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
+                        // Clear the splitting state on parent shards
+                        let updated = diesel::update(tenant_shards)
+                            .filter(tenant_id.eq(split_tenant_id.to_string()))
+                            .filter(shard_count.ne(new_shard_count.literal() as i32))
+                            .set((splitting.eq(0),))
+                            .execute(conn)?;
 
-                // Parent shards are already gone: we cannot abort.
-                if updated == 0 {
-                    return Ok(AbortShardSplitStatus::Complete);
-                }
+                        // Parent shards are already gone: we cannot abort.
+                        if updated == 0 {
+                            return Ok(AbortShardSplitStatus::Complete);
+                        }
 
-                // Sanity check: if parent shards were present, their cardinality should
-                // be less than the number of child shards.
-                if updated >= new_shard_count.count() as usize {
-                    return Err(DatabaseError::Logical(format!(
-                        "Unexpected parent shard count {updated} while aborting split to \
+                        // Sanity check: if parent shards were present, their cardinality should
+                        // be less than the number of child shards.
+                        if updated >= new_shard_count.count() as usize {
+                            return Err(DatabaseError::Logical(format!(
+                                "Unexpected parent shard count {updated} while aborting split to \
                             count {new_shard_count:?} on tenant {split_tenant_id}"
-                    )));
-                }
+                            )));
+                        }
 
-                // Erase child shards
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(new_shard_count.literal() as i32))
-                    .execute(conn)?;
+                        // Erase child shards
+                        diesel::delete(tenant_shards)
+                            .filter(tenant_id.eq(split_tenant_id.to_string()))
+                            .filter(shard_count.eq(new_shard_count.literal() as i32))
+                            .execute(conn)?;
 
-                Ok(AbortShardSplitStatus::Aborted)
-            })?;
+                        Ok(AbortShardSplitStatus::Aborted)
+                    })?;
 
-            Ok(aborted)
-        })
+                Ok(aborted)
+            },
+        )
         .await
     }
 }
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index f00f35c74b..a62357f9ac 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,3 +1,4 @@
+use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
 use hyper::StatusCode;
@@ -117,6 +118,15 @@ impl Reconciler {
         flush_ms: Option<Duration>,
         lazy: bool,
     ) -> Result<(), ReconcileError> {
+        if !node.is_available() && config.mode == LocationConfigMode::Detached {
+            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
+            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
+            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
+            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
+            self.observed.locations.remove(&node.get_id());
+            return Ok(());
+        }
+
         self.observed
             .locations
             .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -149,9 +159,16 @@ impl Reconciler {
         };
         tracing::info!("location_config({node}) complete: {:?}", config);
 
-        self.observed
-            .locations
-            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
+        match config.mode {
+            LocationConfigMode::Detached => {
+                self.observed.locations.remove(&node.get_id());
+            }
+            _ => {
+                self.observed
+                    .locations
+                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
+            }
+        }
 
         Ok(())
     }
@@ -243,8 +260,11 @@ impl Reconciler {
         tenant_shard_id: TenantShardId,
         node: &Node,
     ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
+        let client = PageserverClient::new(
+            node.get_id(),
+            node.base_url(),
+            self.service_config.jwt_token.as_deref(),
+        );
 
         let timelines = client.timeline_list(&tenant_shard_id).await?;
         Ok(timelines
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index e38007c7af..aa930014b2 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -27,6 +27,7 @@ use pageserver_api::{
     models::{SecondaryProgress, TenantConfigRequest},
 };
 
+use crate::pageserver_client::PageserverClient;
 use pageserver_api::{
     models::{
         self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
@@ -209,6 +210,7 @@ struct ShardSplitParams {
     new_stripe_size: Option<ShardStripeSize>,
     targets: Vec<ShardSplitTarget>,
     policy: PlacementPolicy,
+    config: TenantConfig,
     shard_ident: ShardIdentity,
 }
 
@@ -551,7 +553,11 @@ impl Service {
                 break;
             }
 
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
             match client
                 .location_config(
                     tenant_shard_id,
@@ -1388,7 +1394,8 @@ impl Service {
             incremented_generations.len()
         );
 
-        // Apply the updated generation to our in-memory state
+        // Apply the updated generation to our in-memory state, and
+        // gather discover secondary locations.
         let mut locked = self.inner.write().unwrap();
         let (nodes, tenants, scheduler) = locked.parts_mut();
 
@@ -1396,62 +1403,65 @@ impl Service {
             tenants: Vec::new(),
         };
 
-        for (tenant_shard_id, new_gen) in incremented_generations {
-            response.tenants.push(ReAttachResponseTenant {
-                id: tenant_shard_id,
-                gen: new_gen.into().unwrap(),
-            });
-            // Apply the new generation number to our in-memory state
-            let shard_state = tenants.get_mut(&tenant_shard_id);
-            let Some(shard_state) = shard_state else {
-                // Not fatal.  This edge case requires a re-attach to happen
-                // between inserting a new tenant shard in to the database, and updating our in-memory
-                // state to know about the shard, _and_ that the state inserted to the database referenced
-                // a pageserver.  Should never happen, but handle it rather than panicking, since it should
-                // be harmless.
-                tracing::error!(
-                    "Shard {} is in database for node {} but not in-memory state",
-                    tenant_shard_id,
-                    reattach_req.node_id
-                );
-                continue;
-            };
+        // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
+        // to call location_conf API with an old generation.  Wait for cancellation to complete
+        // before responding to this request.  Requires well implemented CancellationToken logic
+        // all the way to where we call location_conf.  Even then, there can still be a location_conf
+        // request in flight over the network: TODO handle that by making location_conf API refuse
+        // to go backward in generations.
 
-            // If [`Persistence::re_attach`] selected this shard, it must have alread
-            // had a generation set.
-            debug_assert!(shard_state.generation.is_some());
-            let Some(old_gen) = shard_state.generation else {
-                // Should never happen:  would only return incremented generation
-                // for a tenant that already had a non-null generation.
-                return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                    "Generation must be set while re-attaching"
-                )));
-            };
-            shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
-            if let Some(observed) = shard_state
-                .observed
-                .locations
-                .get_mut(&reattach_req.node_id)
-            {
-                if let Some(conf) = observed.conf.as_mut() {
-                    conf.generation = new_gen.into();
+        // Scan through all shards, applying updates for ones where we updated generation
+        // and identifying shards that intend to have a secondary location on this node.
+        for (tenant_shard_id, shard) in tenants {
+            if let Some(new_gen) = incremented_generations.get(tenant_shard_id) {
+                let new_gen = *new_gen;
+                response.tenants.push(ReAttachResponseTenant {
+                    id: *tenant_shard_id,
+                    gen: Some(new_gen.into().unwrap()),
+                    // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`]
+                    // execution.  If a pageserver is restarted during that process, then the reconcile pass will
+                    // fail, and start from scratch, so it doesn't make sense for us to try and preserve
+                    // the stale/multi states at this point.
+                    mode: LocationConfigMode::AttachedSingle,
+                });
+
+                shard.generation = std::cmp::max(shard.generation, Some(new_gen));
+                if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) {
+                    // Why can we update `observed` even though we're not sure our response will be received
+                    // by the pageserver?  Because the pageserver will not proceed with startup until
+                    // it has processed response: if it loses it, we'll see another request and increment
+                    // generation again, avoiding any uncertainty about dirtiness of tenant's state.
+                    if let Some(conf) = observed.conf.as_mut() {
+                        conf.generation = new_gen.into();
+                    }
+                } else {
+                    // This node has no observed state for the shard: perhaps it was offline
+                    // when the pageserver restarted.  Insert a None, so that the Reconciler
+                    // will be prompted to learn the location's state before it makes changes.
+                    shard
+                        .observed
+                        .locations
+                        .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
                 }
-            } else {
-                // This node has no observed state for the shard: perhaps it was offline
-                // when the pageserver restarted.  Insert a None, so that the Reconciler
-                // will be prompted to learn the location's state before it makes changes.
-                shard_state
-                    .observed
-                    .locations
-                    .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
-            }
+            } else if shard.intent.get_secondary().contains(&reattach_req.node_id) {
+                // Ordering: pageserver will not accept /location_config requests until it has
+                // finished processing the response from re-attach.  So we can update our in-memory state
+                // now, and be confident that we are not stamping on the result of some later location config.
+                // TODO: however, we are not strictly ordered wrt ReconcileResults queue,
+                // so we might update observed state here, and then get over-written by some racing
+                // ReconcileResult.  The impact is low however, since we have set state on pageserver something
+                // that matches intent, so worst case if we race then we end up doing a spurious reconcile.
 
-            // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
-            // to call location_conf API with an old generation.  Wait for cancellation to complete
-            // before responding to this request.  Requires well implemented CancellationToken logic
-            // all the way to where we call location_conf.  Even then, there can still be a location_conf
-            // request in flight over the network: TODO handle that by making location_conf API refuse
-            // to go backward in generations.
+                response.tenants.push(ReAttachResponseTenant {
+                    id: *tenant_shard_id,
+                    gen: None,
+                    mode: LocationConfigMode::Secondary,
+                });
+
+                // We must not update observed, because we have no guarantee that our
+                // response will be received by the pageserver. This could leave it
+                // falsely dirty, but the resulting reconcile should be idempotent.
+            }
         }
 
         // We consider a node Active once we have composed a re-attach response, but we
@@ -2096,8 +2106,11 @@ impl Service {
                 })
                 .collect::<Vec<_>>();
             for tenant_shard_id in shard_ids {
-                let client =
-                    mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+                let client = PageserverClient::new(
+                    node.get_id(),
+                    node.base_url(),
+                    self.config.jwt_token.as_deref(),
+                );
 
                 tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
 
@@ -2149,7 +2162,11 @@ impl Service {
         // Issue concurrent requests to all shards' locations
         let mut futs = FuturesUnordered::new();
         for (tenant_shard_id, node) in targets {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
             futs.push(async move {
                 let result = client
                     .tenant_secondary_download(tenant_shard_id, wait)
@@ -2242,7 +2259,11 @@ impl Service {
         // Phase 1: delete on the pageservers
         let mut any_pending = false;
         for (tenant_shard_id, node) in targets {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
             // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
             // surface immediately as an error to our caller.
             let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
@@ -2354,7 +2375,7 @@ impl Service {
                 tenant_shard_id,
                 create_req.new_timeline_id,
             );
-            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
+            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
 
             client
                 .timeline_create(tenant_shard_id, &create_req)
@@ -2478,7 +2499,7 @@ impl Service {
                 "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
             );
 
-            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
+            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
             client
                 .timeline_delete(tenant_shard_id, timeline_id)
                 .await
@@ -2519,11 +2540,11 @@ impl Service {
     }
 
     /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
-    /// function looks it up and returns the url.  If the tenant isn't found, returns Err(ApiError::NotFound)
-    pub(crate) fn tenant_shard0_baseurl(
+    /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
+    pub(crate) fn tenant_shard0_node(
         &self,
         tenant_id: TenantId,
-    ) -> Result<(String, TenantShardId), ApiError> {
+    ) -> Result<(Node, TenantShardId), ApiError> {
         let locked = self.inner.read().unwrap();
         let Some((tenant_shard_id, shard)) = locked
             .tenants
@@ -2555,7 +2576,7 @@ impl Service {
             )));
         };
 
-        Ok((node.base_url(), *tenant_shard_id))
+        Ok((node.clone(), *tenant_shard_id))
     }
 
     pub(crate) fn tenant_locate(
@@ -2725,7 +2746,7 @@ impl Service {
         let detach_locations: Vec<(Node, TenantShardId)> = {
             let mut detach_locations = Vec::new();
             let mut locked = self.inner.write().unwrap();
-            let (nodes, tenants, _scheduler) = locked.parts_mut();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
 
             for (tenant_shard_id, shard) in
                 tenants.range_mut(TenantShardId::tenant_range(op.tenant_id))
@@ -2758,6 +2779,13 @@ impl Service {
 
                 tracing::info!("Restoring parent shard {tenant_shard_id}");
                 shard.splitting = SplitState::Idle;
+                if let Err(e) = shard.schedule(scheduler) {
+                    // If this shard can't be scheduled now (perhaps due to offline nodes or
+                    // capacity issues), that must not prevent us rolling back a split.  In this
+                    // case it should be eventually scheduled in the background.
+                    tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}")
+                }
+
                 self.maybe_reconcile_shard(shard, nodes);
             }
 
@@ -2849,7 +2877,7 @@ impl Service {
                 .map(|(shard_id, _)| *shard_id)
                 .collect::<Vec<_>>();
 
-            let (_nodes, tenants, scheduler) = locked.parts_mut();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
             for parent_id in parent_ids {
                 let child_ids = parent_id.split(new_shard_count);
 
@@ -2916,6 +2944,8 @@ impl Service {
                         // find a secondary (e.g. because cluster is overloaded).
                         tracing::warn!("Failed to schedule child shard {child}: {e}");
                     }
+                    // In the background, attach secondary locations for the new shards
+                    self.maybe_reconcile_shard(&mut child_state, nodes);
 
                     tenants.insert(child, child_state);
                     response.new_shards.push(child);
@@ -2980,6 +3010,7 @@ impl Service {
         )));
 
         let mut policy = None;
+        let mut config = None;
         let mut shard_ident = None;
         // Validate input, and calculate which shards we will create
         let (old_shard_count, targets) =
@@ -3036,6 +3067,9 @@ impl Service {
                     if shard_ident.is_none() {
                         shard_ident = Some(shard.shard);
                     }
+                    if config.is_none() {
+                        config = Some(shard.config.clone());
+                    }
 
                     if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                         tracing::info!(
@@ -3054,8 +3088,6 @@ impl Service {
                         .get(&node_id)
                         .expect("Pageservers may not be deleted while referenced");
 
-                    // TODO: if any reconciliation is currently in progress for this shard, wait for it.
-
                     targets.push(ShardSplitTarget {
                         parent_id: *tenant_shard_id,
                         node: node.clone(),
@@ -3098,6 +3130,7 @@ impl Service {
             shard_ident.unwrap()
         };
         let policy = policy.unwrap();
+        let config = config.unwrap();
 
         Ok(ShardSplitAction::Split(ShardSplitParams {
             old_shard_count,
@@ -3105,6 +3138,7 @@ impl Service {
             new_stripe_size: split_req.new_stripe_size,
             targets,
             policy,
+            config,
             shard_ident,
         }))
     }
@@ -3124,11 +3158,49 @@ impl Service {
             old_shard_count,
             new_shard_count,
             new_stripe_size,
-            targets,
+            mut targets,
             policy,
+            config,
             shard_ident,
         } = params;
 
+        // Drop any secondary locations: pageservers do not support splitting these, and in any case the
+        // end-state for a split tenant will usually be to have secondary locations on different nodes.
+        // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation
+        // at the time of split.
+        let waiters = {
+            let mut locked = self.inner.write().unwrap();
+            let mut waiters = Vec::new();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+            for target in &mut targets {
+                let Some(shard) = tenants.get_mut(&target.parent_id) else {
+                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} not found",
+                        target.parent_id
+                    )));
+                };
+
+                if shard.intent.get_attached() != &Some(target.node.get_id()) {
+                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
+                    return Err(ApiError::Conflict(format!(
+                        "Shard {} unexpectedly rescheduled during split",
+                        target.parent_id
+                    )));
+                }
+
+                // Irrespective of PlacementPolicy, clear secondary locations from intent
+                shard.intent.clear_secondary(scheduler);
+
+                // Run Reconciler to execute detach fo secondary locations.
+                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
+
         // Before creating any new child shards in memory or on the pageservers, persist them: this
         // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
         // acts as the protection against two concurrent attempts to split: one of them will get a database
@@ -3157,8 +3229,7 @@ impl Service {
                     generation: None,
                     generation_pageserver: Some(target.node.get_id().0 as i64),
                     placement_policy: serde_json::to_string(&policy).unwrap(),
-                    // TODO: get the config out of the map
-                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                    config: serde_json::to_string(&config).unwrap(),
                     splitting: SplitState::Splitting,
                 });
             }
@@ -3215,7 +3286,11 @@ impl Service {
                 node,
                 child_ids,
             } = target;
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+            let client = PageserverClient::new(
+                node.get_id(),
+                node.base_url(),
+                self.config.jwt_token.as_deref(),
+            );
             let response = client
                 .tenant_shard_split(
                     *parent_id,
@@ -3343,6 +3418,11 @@ impl Service {
                         // If we were already attached to something, demote that to a secondary
                         if let Some(old_attached) = old_attached {
                             if n > 0 {
+                                // Remove other secondaries to make room for the location we'll demote
+                                while shard.intent.get_secondary().len() >= n {
+                                    shard.intent.pop_secondary(scheduler);
+                                }
+
                                 shard.intent.push_secondary(scheduler, old_attached);
                             }
                         }
@@ -3370,7 +3450,7 @@ impl Service {
         if let Some(waiter) = waiter {
             waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
         } else {
-            tracing::warn!("Migration is a no-op");
+            tracing::info!("Migration is a no-op");
         }
 
         Ok(TenantShardMigrateResponse {})
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index 9dd368bf41..83c921dc58 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -4,7 +4,10 @@ use std::{
     time::Duration,
 };
 
-use crate::{metrics, persistence::TenantShardPersistence};
+use crate::{
+    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
+    persistence::TenantShardPersistence,
+};
 use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -718,7 +721,10 @@ impl TenantState {
         let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                         tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                         shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::RECONCILER.spawned.inc();
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_reconcile_spawn
+            .inc();
         let result_tx = result_tx.clone();
         let join_handle = tokio::task::spawn(
             async move {
@@ -736,10 +742,12 @@ impl TenantState {
                 // TODO: wrap all remote API operations in cancellation check
                 // as well.
                 if reconciler.cancel.is_cancelled() {
-                    metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
-                        .inc();
+                    metrics::METRICS_REGISTRY
+                        .metrics_group
+                        .storage_controller_reconcile_complete
+                        .inc(ReconcileCompleteLabelGroup {
+                            status: ReconcileOutcome::Cancel,
+                        });
                     return;
                 }
 
@@ -754,18 +762,18 @@ impl TenantState {
                 }
 
                 // Update result counter
-                match &result {
-                    Ok(_) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
-                    Err(ReconcileError::Cancel) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
-                    Err(_) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
-                }
-                .inc();
+                let outcome_label = match &result {
+                    Ok(_) => ReconcileOutcome::Success,
+                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
+                    Err(_) => ReconcileOutcome::Error,
+                };
+
+                metrics::METRICS_REGISTRY
+                    .metrics_group
+                    .storage_controller_reconcile_complete
+                    .inc(ReconcileCompleteLabelGroup {
+                        status: outcome_label,
+                    });
 
                 result_tx
                     .send(ReconcileResult {
diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs
index 5472948091..2e88836bd0 100644
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,7 +6,9 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;
 
-use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
+use crate::{
+    controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
+};
 
 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
@@ -20,12 +22,20 @@ pub struct ReAttachRequest {
     pub register: Option<NodeRegisterRequest>,
 }
 
-#[derive(Serialize, Deserialize)]
-pub struct ReAttachResponseTenant {
-    pub id: TenantShardId,
-    pub gen: u32,
+fn default_mode() -> LocationConfigMode {
+    LocationConfigMode::AttachedSingle
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct ReAttachResponseTenant {
+    pub id: TenantShardId,
+    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
+    pub gen: Option<u32>,
+
+    /// Default value only for backward compat: this field should be set
+    #[serde(default = "default_mode")]
+    pub mode: LocationConfigMode,
+}
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
     pub tenants: Vec<ReAttachResponseTenant>,
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index a60971abf0..f8a5f68131 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
     }
 }
 
-async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
     SERVE_METRICS_COUNT.inc();
 
     let started_at = std::time::Instant::now();
@@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
         .middleware(Middleware::post_with_info(
             add_request_id_header_to_response,
         ))
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .err_handler(route_error_handler)
 }
 
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index a3aee45b58..8eee1f72a6 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
     ///
     /// The permit will be on a semaphore part of the new internal value, and any following
     /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
         let mut swapped = Inner::default();
         let sem = swapped.init_semaphore.clone();
         // acquire and forget right away, moving the control over to InitPermit
@@ -543,7 +543,7 @@ mod tests {
         target.set(42, permit);
 
         let (_answer, permit) = {
-            let mut guard = target
+            let guard = target
                 .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
                 .await
                 .unwrap();
diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 9953b447c8..18b2af14f1 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,27 +1,60 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum VecMapOrdering {
+    Greater,
+    GreaterOrEqual,
+}
+
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
+/// Ordering can be adjusted using [`VecMapOrdering`]
+/// during `VecMap` construction.
 #[derive(Clone, Debug)]
-pub struct VecMap<K, V>(Vec<(K, V)>);
+pub struct VecMap<K, V> {
+    data: Vec<(K, V)>,
+    ordering: VecMapOrdering,
+}
 
 impl<K, V> Default for VecMap<K, V> {
     fn default() -> Self {
-        VecMap(Default::default())
+        VecMap {
+            data: Default::default(),
+            ordering: VecMapOrdering::Greater,
+        }
     }
 }
 
-#[derive(Debug)]
-pub struct InvalidKey;
+#[derive(thiserror::Error, Debug)]
+pub enum VecMapError {
+    #[error("Key violates ordering constraint")]
+    InvalidKey,
+    #[error("Mismatched ordering constraints")]
+    ExtendOrderingError,
+}
 
 impl<K: Ord, V> VecMap<K, V> {
+    pub fn new(ordering: VecMapOrdering) -> Self {
+        Self {
+            data: Vec::new(),
+            ordering,
+        }
+    }
+
+    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
+        Self {
+            data: Vec::with_capacity(capacity),
+            ordering,
+        }
+    }
+
     pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
+        self.data.is_empty()
     }
 
     pub fn as_slice(&self) -> &[(K, V)] {
-        self.0.as_slice()
+        self.data.as_slice()
     }
 
     /// This function may panic if given a range where the lower bound is
@@ -29,7 +62,7 @@ impl<K: Ord, V> VecMap<K, V> {
     pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
         use std::ops::Bound::*;
 
-        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
+        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
 
         let start_idx = match range.start_bound() {
             Unbounded => 0,
@@ -41,7 +74,7 @@ impl<K: Ord, V> VecMap<K, V> {
         };
 
         let end_idx = match range.end_bound() {
-            Unbounded => self.0.len(),
+            Unbounded => self.data.len(),
             Included(k) => match binary_search(k) {
                 Ok(idx) => idx + 1,
                 Err(idx) => idx,
@@ -49,34 +82,30 @@ impl<K: Ord, V> VecMap<K, V> {
             Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
         };
 
-        &self.0[start_idx..end_idx]
+        &self.data[start_idx..end_idx]
     }
 
     /// Add a key value pair to the map.
-    /// If `key` is less than or equal to the current maximum key
-    /// the pair will not be added and InvalidKey error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
-        if let Some((last_key, _last_value)) = self.0.last() {
-            if &key <= last_key {
-                return Err(InvalidKey);
-            }
-        }
+    /// If `key` is not respective of the `self` ordering the
+    /// pair will not be added and `InvalidKey` error will be returned.
+    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
+        self.validate_key_order(&key)?;
 
         let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
         Ok(delta_size)
     }
 
     /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If `key` is less than the current maximum key no updates or additions
-    /// will occur and InvalidKey error will be returned.
+    /// If `key` is not respective of the `self` ordering no updates or additions
+    /// will occur and `InvalidKey` error will be returned.
     pub fn append_or_update_last(
         &mut self,
         key: K,
         mut value: V,
-    ) -> Result<(Option<V>, usize), InvalidKey> {
-        if let Some((last_key, last_value)) = self.0.last_mut() {
+    ) -> Result<(Option<V>, usize), VecMapError> {
+        if let Some((last_key, last_value)) = self.data.last_mut() {
             match key.cmp(last_key) {
-                Ordering::Less => return Err(InvalidKey),
+                Ordering::Less => return Err(VecMapError::InvalidKey),
                 Ordering::Equal => {
                     std::mem::swap(last_value, &mut value);
                     const DELTA_SIZE: usize = 0;
@@ -100,40 +129,67 @@ impl<K: Ord, V> VecMap<K, V> {
         V: Clone,
     {
         let split_idx = self
-            .0
+            .data
             .binary_search_by_key(&cutoff, extract_key)
             .unwrap_or_else(std::convert::identity);
 
         (
-            VecMap(self.0[..split_idx].to_vec()),
-            VecMap(self.0[split_idx..].to_vec()),
+            VecMap {
+                data: self.data[..split_idx].to_vec(),
+                ordering: self.ordering,
+            },
+            VecMap {
+                data: self.data[split_idx..].to_vec(),
+                ordering: self.ordering,
+            },
         )
     }
 
     /// Move items from `other` to the end of `self`, leaving `other` empty.
-    /// If any keys in `other` is less than or equal to any key in `self`,
-    /// `InvalidKey` error will be returned and no mutation will occur.
-    pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
-        let self_last_opt = self.0.last().map(extract_key);
-        let other_first_opt = other.0.last().map(extract_key);
+    /// If the `other` ordering is different from `self` ordering
+    /// `ExtendOrderingError` error will be returned.
+    /// If any keys in `other` is not respective of the ordering defined in
+    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
+    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
+        if self.ordering != other.ordering {
+            return Err(VecMapError::ExtendOrderingError);
+        }
 
-        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
-            if self_last >= other_first {
-                return Err(InvalidKey);
+        let other_first_opt = other.data.last().map(extract_key);
+        if let Some(other_first) = other_first_opt {
+            self.validate_key_order(other_first)?;
+        }
+
+        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
+        Ok(delta_size)
+    }
+
+    /// Validate the current last key in `self` and key being
+    /// inserted against the order defined in `self`.
+    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
+        if let Some(last_key) = self.data.last().map(extract_key) {
+            match (&self.ordering, &key.cmp(last_key)) {
+                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
+                    return Err(VecMapError::InvalidKey);
+                }
+                (VecMapOrdering::Greater, Ordering::Greater) => {}
+                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
+                    return Err(VecMapError::InvalidKey);
+                }
+                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
             }
         }
 
-        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
-        Ok(delta_size)
+        Ok(())
     }
 
     /// Instrument an operation on the underlying [`Vec`].
     /// Will panic if the operation decreases capacity.
     /// Returns the increase in memory usage caused by the op.
     fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
-        let old_cap = self.0.capacity();
-        op(&mut self.0);
-        let new_cap = self.0.capacity();
+        let old_cap = self.data.capacity();
+        op(&mut self.data);
+        let new_cap = self.data.capacity();
 
         match old_cap.cmp(&new_cap) {
             Ordering::Less => {
@@ -145,6 +201,36 @@ impl<K: Ord, V> VecMap<K, V> {
             Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
         }
     }
+
+    /// Similar to `from_iter` defined in `FromIter` trait except
+    /// that it accepts an [`VecMapOrdering`]
+    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
+        let iter = iter.into_iter();
+        let initial_capacity = {
+            match iter.size_hint() {
+                (lower_bound, None) => lower_bound,
+                (_, Some(upper_bound)) => upper_bound,
+            }
+        };
+
+        let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
+        for (key, value) in iter {
+            vec_map
+                .append(key, value)
+                .expect("The passed collection needs to be sorted!");
+        }
+
+        vec_map
+    }
+}
+
+impl<K: Ord, V> IntoIterator for VecMap<K, V> {
+    type Item = (K, V);
+    type IntoIter = std::vec::IntoIter<(K, V)>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.data.into_iter()
+    }
 }
 
 fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -155,7 +241,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
 mod tests {
     use std::{collections::BTreeMap, ops::Bound};
 
-    use super::VecMap;
+    use super::{VecMap, VecMapOrdering};
 
     #[test]
     fn unbounded_range() {
@@ -310,5 +396,59 @@ mod tests {
         left.extend(&mut one_map).unwrap_err();
         assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
         assert_eq!(one_map.as_slice(), &[(1, ())]);
+
+        let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
+        map_greater_or_equal.append(2, ()).unwrap();
+        map_greater_or_equal.append(2, ()).unwrap();
+
+        left.extend(&mut map_greater_or_equal).unwrap_err();
+        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
+        assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
+    }
+
+    #[test]
+    fn extend_with_ordering() {
+        let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
+        left.append(0, ()).unwrap();
+        assert_eq!(left.as_slice(), &[(0, ())]);
+
+        let mut greater_right = VecMap::new(VecMapOrdering::Greater);
+        greater_right.append(0, ()).unwrap();
+        left.extend(&mut greater_right).unwrap_err();
+        assert_eq!(left.as_slice(), &[(0, ())]);
+
+        let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
+        greater_or_equal_right.append(2, ()).unwrap();
+        greater_or_equal_right.append(2, ()).unwrap();
+        left.extend(&mut greater_or_equal_right).unwrap();
+        assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
+    }
+
+    #[test]
+    fn vec_map_from_sorted() {
+        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
+        let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
+        assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
+
+        let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
+        let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
+        assert_eq!(
+            vec_map.as_slice(),
+            &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
+        );
+    }
+
+    #[test]
+    #[should_panic]
+    fn vec_map_from_unsorted_greater() {
+        let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
+        let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
+    }
+
+    #[test]
+    #[should_panic]
+    fn vec_map_from_unsorted_greater_or_equal() {
+        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
+        let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
     }
 }
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 1b3d76335d..42c800822b 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -5,7 +5,8 @@ use pageserver_api::{
     controller_api::NodeRegisterRequest,
     shard::TenantShardId,
     upcall_api::{
-        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
+        ValidateRequestTenant, ValidateResponse,
     },
 };
 use serde::{de::DeserializeOwned, Serialize};
@@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi {
     fn re_attach(
         &self,
         conf: &PageServerConf,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
+    ) -> impl Future<
+        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
+    > + Send;
     fn validate(
         &self,
         tenants: Vec<(TenantShardId, Generation)>,
@@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
     async fn re_attach(
         &self,
         conf: &PageServerConf,
-    ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
         let re_attach_path = self
             .base_url
             .join("re-attach")
@@ -181,7 +184,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
         Ok(response
             .tenants
             .into_iter()
-            .map(|t| (t.id, Generation::new(t.gen)))
+            .map(|rart| (rart.id, rart))
             .collect::<HashMap<_, _>>())
     }
 
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index b6aea8fae8..e3c11cb299 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -724,8 +724,8 @@ impl DeletionQueue {
 mod test {
     use camino::Utf8Path;
     use hex_literal::hex;
-    use pageserver_api::shard::ShardIndex;
-    use std::io::ErrorKind;
+    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
+    use std::{io::ErrorKind, time::Duration};
     use tracing::info;
 
     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -834,9 +834,10 @@ mod test {
         async fn re_attach(
             &self,
             _conf: &PageServerConf,
-        ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+        ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
             unimplemented!()
         }
+
         async fn validate(
             &self,
             tenants: Vec<(TenantShardId, Generation)>,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 229f3ae98f..26f23fb8c2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
+use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -2266,6 +2267,7 @@ pub fn make_router(
 
     Ok(router
         .data(state)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .get("/v1/status", |r| api_handler(r, status_handler))
         .put("/v1/failpoints", |r| {
             testing_api_handler("manage failpoints", r, failpoints_handler)
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 727650a5a5..6f7d74bdee 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -34,6 +34,7 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
+use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
 const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> {
         if !self.pending_updates.is_empty() {
             // The put_batch call below expects expects the inputs to be sorted by Lsn,
             // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
-                .pending_updates
-                .drain()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
-                .collect();
+            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
+                self.pending_updates
+                    .drain()
+                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
+                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
+                VecMapOrdering::GreaterOrEqual,
+            );
 
             writer.put_batch(lsn_ordered_batch, ctx).await?;
         }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7a6ddd6a4e..fe48741a89 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -202,6 +202,13 @@ pub(super) struct AttachedTenantConf {
 }
 
 impl AttachedTenantConf {
+    fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
+        Self {
+            tenant_conf,
+            location,
+        }
+    }
+
     fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
         match &location_conf.mode {
             LocationMode::Attached(attach_conf) => Ok(Self {
@@ -678,9 +685,20 @@ impl Tenant {
                 }
 
                 // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
+                enum BrokenVerbosity {
+                    Error,
+                    Info
+                }
                 let make_broken =
-                    |t: &Tenant, err: anyhow::Error| {
-                        error!("attach failed, setting tenant state to Broken: {err:?}");
+                    |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
+                        match verbosity {
+                            BrokenVerbosity::Info => {
+                                info!("attach cancelled, setting tenant state to Broken: {err}");
+                            },
+                            BrokenVerbosity::Error => {
+                                error!("attach failed, setting tenant state to Broken: {err:?}");
+                            }
+                        }
                         t.state.send_modify(|state| {
                             // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                             // if it errors, we will call make_broken when tenant is already in Stopping.
@@ -744,7 +762,7 @@ impl Tenant {
                             // Make the tenant broken so that set_stopping will not hang waiting for it to leave
                             // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
                             // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
+                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
                             return Ok(());
                         },
                     )
@@ -766,7 +784,7 @@ impl Tenant {
                         match res {
                             Ok(p) => Some(p),
                             Err(e) => {
-                                make_broken(&tenant_clone, anyhow::anyhow!(e));
+                                make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                                 return Ok(());
                             }
                         }
@@ -790,7 +808,7 @@ impl Tenant {
                     {
                         Ok(should_resume_deletion) => should_resume_deletion,
                         Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
                             return Ok(());
                         }
                     }
@@ -820,7 +838,7 @@ impl Tenant {
                     .await;
 
                     if let Err(e) = deleted {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                     }
 
                     return Ok(());
@@ -841,7 +859,7 @@ impl Tenant {
                         tenant_clone.activate(broker_client, None, &ctx);
                     }
                     Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                     }
                 }
 
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 57fc444cdd..53a8c97e23 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -196,16 +196,17 @@ impl LocationConf {
     /// For use when attaching/re-attaching: update the generation stored in this
     /// structure.  If we were in a secondary state, promote to attached (posession
     /// of a fresh generation implies this).
-    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
+    pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
         match &mut self.mode {
             LocationMode::Attached(attach_conf) => {
                 attach_conf.generation = generation;
+                attach_conf.attach_mode = mode;
             }
             LocationMode::Secondary(_) => {
                 // We are promoted to attached by the control plane's re-attach response
                 self.mode = LocationMode::Attached(AttachedLocationConfig {
                     generation,
-                    attach_mode: AttachmentMode::Single,
+                    attach_mode: mode,
                 })
             }
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index f456ca3006..97a505ded9 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,13 +2,13 @@
 //! page server.
 
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
-use pageserver_api::models::ShardParameters;
+use pageserver_api::models::{LocationConfigMode, ShardParameters};
 use pageserver_api::shard::{
     ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
+use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -125,6 +125,46 @@ pub(crate) enum ShardSelector {
     Page(Key),
 }
 
+/// A convenience for use with the re_attach ControlPlaneClient function: rather
+/// than the serializable struct, we build this enum that encapsulates
+/// the invariant that attached tenants always have generations.
+///
+/// This represents the subset of a LocationConfig that we receive during re-attach.
+pub(crate) enum TenantStartupMode {
+    Attached((AttachmentMode, Generation)),
+    Secondary,
+}
+
+impl TenantStartupMode {
+    /// Return the generation & mode that should be used when starting
+    /// this tenant.
+    ///
+    /// If this returns None, the re-attach struct is in an invalid state and
+    /// should be ignored in the response.
+    fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
+        match (rart.mode, rart.gen) {
+            (LocationConfigMode::Detached, _) => None,
+            (LocationConfigMode::Secondary, _) => Some(Self::Secondary),
+            (LocationConfigMode::AttachedMulti, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
+            }
+            (LocationConfigMode::AttachedSingle, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
+            }
+            (LocationConfigMode::AttachedStale, Some(g)) => {
+                Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
+            }
+            _ => {
+                tracing::warn!(
+                    "Received invalid re-attach state for tenant {}: {rart:?}",
+                    rart.id
+                );
+                None
+            }
+        }
+    }
+}
+
 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -271,7 +311,7 @@ pub struct TenantManager {
 
 fn emergency_generations(
     tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantShardId, Generation> {
+) -> HashMap<TenantShardId, TenantStartupMode> {
     tenant_confs
         .iter()
         .filter_map(|(tid, lc)| {
@@ -279,12 +319,15 @@ fn emergency_generations(
                 Ok(lc) => lc,
                 Err(_) => return None,
             };
-            let gen = match &lc.mode {
-                LocationMode::Attached(alc) => Some(alc.generation),
-                LocationMode::Secondary(_) => None,
-            };
-
-            gen.map(|g| (*tid, g))
+            Some((
+                *tid,
+                match &lc.mode {
+                    LocationMode::Attached(alc) => {
+                        TenantStartupMode::Attached((alc.attach_mode, alc.generation))
+                    }
+                    LocationMode::Secondary(_) => TenantStartupMode::Secondary,
+                },
+            ))
         })
         .collect()
 }
@@ -294,7 +337,7 @@ async fn init_load_generations(
     tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
     resources: &TenantSharedResources,
     cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
+) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
     let generations = if conf.control_plane_emergency_mode {
         error!(
             "Emergency mode!  Tenants will be attached unsafely using their last known generation"
@@ -304,7 +347,12 @@ async fn init_load_generations(
         info!("Calling control plane API to re-attach tenants");
         // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
         match client.re_attach(conf).await {
-            Ok(tenants) => tenants,
+            Ok(tenants) => tenants
+                .into_iter()
+                .flat_map(|(id, rart)| {
+                    TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
+                })
+                .collect(),
             Err(RetryForeverError::ShuttingDown) => {
                 anyhow::bail!("Shut down while waiting for control plane re-attach response")
             }
@@ -322,9 +370,17 @@ async fn init_load_generations(
     // Must only do this if remote storage is enabled, otherwise deletion queue
     // is not running and channel push will fail.
     if resources.remote_storage.is_some() {
-        resources
-            .deletion_queue_client
-            .recover(generations.clone())?;
+        let attached_tenants = generations
+            .iter()
+            .flat_map(|(id, start_mode)| {
+                match start_mode {
+                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                    TenantStartupMode::Secondary => None,
+                }
+                .map(|gen| (*id, *gen))
+            })
+            .collect();
+        resources.deletion_queue_client.recover(attached_tenants)?;
     }
 
     Ok(Some(generations))
@@ -490,9 +546,8 @@ pub async fn init_tenant_mgr(
     // Scan local filesystem for attached tenants
     let tenant_configs = init_load_tenant_configs(conf).await?;
 
-    // Determine which tenants are to be attached
-    let tenant_generations =
-        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+    // Determine which tenants are to be secondary or attached, and in which generation
+    let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
 
     tracing::info!(
         "Attaching {} tenants at startup, warming up {} at a time",
@@ -522,97 +577,102 @@ pub async fn init_tenant_mgr(
             }
         };
 
-        let generation = if let Some(generations) = &tenant_generations {
+        // FIXME: if we were attached, and get demoted to secondary on re-attach, we
+        // don't have a place to get a config.
+        // (https://github.com/neondatabase/neon/issues/5377)
+        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
+            SecondaryLocationConfig { warm: true };
+
+        // Update the location config according to the re-attach response
+        if let Some(tenant_modes) = &tenant_modes {
             // We have a generation map: treat it as the authority for whether
             // this tenant is really attached.
-            if let Some(gen) = generations.get(&tenant_shard_id) {
-                if let LocationMode::Attached(attached) = &location_conf.mode {
-                    if attached.generation > *gen {
+            match tenant_modes.get(&tenant_shard_id) {
+                None => {
+                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
+                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                        );
+                    }
+
+                    // We deleted local content: move on to next tenant, don't try and spawn this one.
+                    continue;
+                }
+                Some(TenantStartupMode::Secondary) => {
+                    if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
+                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
+                    }
+                }
+                Some(TenantStartupMode::Attached((attach_mode, generation))) => {
+                    let old_gen_higher = match &location_conf.mode {
+                        LocationMode::Attached(AttachedLocationConfig {
+                            generation: old_generation,
+                            attach_mode: _attach_mode,
+                        }) => {
+                            if old_generation > generation {
+                                Some(old_generation)
+                            } else {
+                                None
+                            }
+                        }
+                        _ => None,
+                    };
+                    if let Some(old_generation) = old_gen_higher {
                         tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
-                            attached.generation
+                            "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            old_generation
                         );
 
                         // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
                         // local disk content: demote to secondary rather than detaching.
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                location_conf.shard,
-                                location_conf.tenant_conf.clone(),
-                                &SecondaryLocationConfig { warm: false },
-                            )),
-                        );
+                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
+                    } else {
+                        location_conf.attach_in_generation(*attach_mode, *generation);
                     }
                 }
-                *gen
-            } else {
-                match &location_conf.mode {
-                    LocationMode::Secondary(secondary_config) => {
-                        // We do not require the control plane's permission for secondary mode
-                        // tenants, because they do no remote writes and hence require no
-                        // generation number
-                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(
-                            tenant_shard_id,
-                            TenantSlot::Secondary(SecondaryTenant::new(
-                                tenant_shard_id,
-                                location_conf.shard,
-                                location_conf.tenant_conf,
-                                secondary_config,
-                            )),
-                        );
-                    }
-                    LocationMode::Attached(_) => {
-                        // TODO: augment re-attach API to enable the control plane to
-                        // instruct us about secondary attachments.  That way, instead of throwing
-                        // away local state, we can gracefully fall back to secondary here, if the control
-                        // plane tells us so.
-                        // (https://github.com/neondatabase/neon/issues/5377)
-                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
-                            );
-                        }
-                    }
-                };
-
-                continue;
             }
         } else {
             // Legacy mode: no generation information, any tenant present
             // on local disk may activate
             info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
-            Generation::none()
         };
 
         // Presence of a generation number implies attachment: attach the tenant
         // if it wasn't already, and apply the generation number.
-        location_conf.attach_in_generation(generation);
         Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
 
         let shard_identity = location_conf.shard;
-        match tenant_spawn(
-            conf,
-            tenant_shard_id,
-            &tenant_dir_path,
-            resources.clone(),
-            AttachedTenantConf::try_from(location_conf)?,
-            shard_identity,
-            Some(init_order.clone()),
-            &TENANTS,
-            SpawnMode::Lazy,
-            &ctx,
-        ) {
-            Ok(tenant) => {
-                tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
+        let slot = match location_conf.mode {
+            LocationMode::Attached(attached_conf) => {
+                match tenant_spawn(
+                    conf,
+                    tenant_shard_id,
+                    &tenant_dir_path,
+                    resources.clone(),
+                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    shard_identity,
+                    Some(init_order.clone()),
+                    &TENANTS,
+                    SpawnMode::Lazy,
+                    &ctx,
+                ) {
+                    Ok(tenant) => TenantSlot::Attached(tenant),
+                    Err(e) => {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
+                        continue;
+                    }
+                }
             }
-            Err(e) => {
-                error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
-            }
-        }
+            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
+                tenant_shard_id,
+                shard_identity,
+                location_conf.tenant_conf,
+                &secondary_conf,
+            )),
+        };
+
+        tenants.insert(tenant_shard_id, slot);
     }
 
     info!("Processed {} local tenants at startup", tenants.len());
@@ -1662,9 +1722,9 @@ impl TenantManager {
                     .layers
                     .read()
                     .await
-                    .resident_layers()
-                    .collect::<Vec<_>>()
-                    .await;
+                    .likely_resident_layers()
+                    .collect::<Vec<_>>();
+
                 for layer in timeline_layers {
                     let relative_path = layer
                         .local_path()
@@ -2143,7 +2203,7 @@ pub(crate) async fn load_tenant(
 
     let mut location_conf =
         Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
-    location_conf.attach_in_generation(generation);
+    location_conf.attach_in_generation(AttachmentMode::Single, generation);
 
     Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 82af7ed83b..40f19e3b05 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -11,6 +11,7 @@ use crate::{
     disk_usage_eviction_task::{
         finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
     },
+    is_temporary,
     metrics::SECONDARY_MODE,
     tenant::{
         config::SecondaryLocationConfig,
@@ -961,7 +962,10 @@ async fn init_timeline_state(
             // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
             warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
             continue;
-        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
+        } else if crate::is_temporary(&file_path)
+            || is_temp_download_file(&file_path)
+            || is_temporary(&file_path)
+        {
             // Temporary files are frequently left behind from restarting during downloads
             tracing::info!("Cleaning up temporary file {file_path}");
             if let Err(e) = tokio::fs::remove_file(&file_path)
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index eed423c3e6..8ba37b5a86 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -32,6 +32,9 @@ use utils::generation::Generation;
 #[cfg(test)]
 mod tests;
 
+#[cfg(test)]
+mod failpoints;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -46,7 +49,41 @@ mod tests;
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 ///
-/// This type models the on-disk layers, which can be evicted and on-demand downloaded.
+/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a
+/// general goal, read accesses should always win eviction and eviction should not wait for
+/// download.
+///
+/// ### State transitions
+///
+/// The internal state of `Layer` is composed of most importantly the on-filesystem state and the
+/// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded,
+/// right size) or deleted.
+///
+/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the
+/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the
+/// `heavier_once_cell::InitPermit` has been acquired, any read request
+/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus
+/// cancelling the eviction.
+///
+/// ```text
+///  +-----------------+   get_or_maybe_download    +--------------------------------+
+///  | not initialized |--------------------------->| Resident(Arc<DownloadedLayer>) |
+///  |     ENOENT      |                         /->|                                |
+///  +-----------------+                         |  +--------------------------------+
+///                  ^                           |                         |       ^
+///                  |    get_or_maybe_download  |                         |       | get_or_maybe_download, either:
+///   evict_blocking | /-------------------------/                         |       | - upgrade weak to strong
+///                  | |                                                   |       | - re-initialize without download
+///                  | |                                    evict_and_wait |       |
+///  +-----------------+                                                   v       |
+///  | not initialized |  on_downloaded_layer_drop  +--------------------------------------+
+///  | file is present |<---------------------------| WantedEvicted(Weak<DownloadedLayer>) |
+///  +-----------------+                            +--------------------------------------+
+/// ```
+///
+/// ### Unsupported
+///
+/// - Evicting by the operator deleting files from the filesystem
 ///
 /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer
 #[derive(Clone)]
@@ -211,8 +248,7 @@ impl Layer {
     ///
     /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
     /// will happen regardless the future returned by this method completing unless there is a
-    /// read access (currently including [`Layer::keep_resident`]) before eviction gets to
-    /// complete.
+    /// read access before eviction gets to complete.
     ///
     /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
     /// of download-evict cycle on retry.
@@ -307,21 +343,28 @@ impl Layer {
     /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
     /// while the guard exists.
     ///
-    /// Returns None if the layer is currently evicted.
-    pub(crate) async fn keep_resident(&self) -> anyhow::Result<Option<ResidentLayer>> {
-        let downloaded = match self.0.get_or_maybe_download(false, None).await {
-            Ok(d) => d,
-            // technically there are a lot of possible errors, but in practice it should only be
-            // DownloadRequired which is tripped up. could work to improve this situation
-            // statically later.
-            Err(DownloadError::DownloadRequired) => return Ok(None),
-            Err(e) => return Err(e.into()),
-        };
+    /// Returns None if the layer is currently evicted or becoming evicted.
+    #[cfg(test)]
+    pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
+        let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;
 
-        Ok(Some(ResidentLayer {
+        Some(ResidentLayer {
             downloaded,
             owner: self.clone(),
-        }))
+        })
+    }
+
+    /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal
+    /// with `EvictionError::NotFound`.
+    ///
+    /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or
+    /// will be unless a read happens soon.
+    pub(crate) fn is_likely_resident(&self) -> bool {
+        self.0
+            .inner
+            .get()
+            .map(|rowe| rowe.is_likely_resident())
+            .unwrap_or(false)
     }
 
     /// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
@@ -371,11 +414,11 @@ impl Layer {
     /// separatedly.
     #[cfg(any(feature = "testing", test))]
     pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
-        let mut rx = self.0.status.subscribe();
+        let mut rx = self.0.status.as_ref().unwrap().subscribe();
 
         async move {
             loop {
-                if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
+                if rx.changed().await.is_err() {
                     break;
                 }
             }
@@ -397,6 +440,32 @@ enum ResidentOrWantedEvicted {
 }
 
 impl ResidentOrWantedEvicted {
+    /// Non-mutating access to the a DownloadedLayer, if possible.
+    ///
+    /// This is not used on the read path (anything that calls
+    /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
+    /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
+    #[cfg(test)]
+    fn get(&self) -> Option<Arc<DownloadedLayer>> {
+        match self {
+            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(),
+        }
+    }
+
+    /// Best-effort query for residency right now, not as strong guarantee as receiving a strong
+    /// reference from `ResidentOrWantedEvicted::get`.
+    fn is_likely_resident(&self) -> bool {
+        match self {
+            ResidentOrWantedEvicted::Resident(_) => true,
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0,
+        }
+    }
+
+    /// Upgrades any weak to strong if possible.
+    ///
+    /// Returns a strong reference if possible, along with a boolean telling if an upgrade
+    /// happened.
     fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
         match self {
             ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
@@ -417,7 +486,7 @@ impl ResidentOrWantedEvicted {
     ///
     /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
     /// drop the possibly last strong reference outside of the mutex of
-    /// heavier_once_cell::OnceCell.
+    /// [`heavier_once_cell::OnceCell`].
     fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
         match self {
             ResidentOrWantedEvicted::Resident(strong) => {
@@ -445,6 +514,9 @@ struct LayerInner {
     desc: PersistentLayerDesc,
 
     /// Timeline access is needed for remote timeline client and metrics.
+    ///
+    /// There should not be an access to timeline for any reason without entering the
+    /// [`Timeline::gate`] at the same time.
     timeline: Weak<Timeline>,
 
     /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
@@ -453,27 +525,38 @@ struct LayerInner {
     access_stats: LayerAccessStats,
 
     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
-    /// Initialization and deinitialization are done while holding a permit.
+    ///
+    /// Filesystem changes (download, evict) are only done while holding a permit which the
+    /// `heavier_once_cell` provides.
+    ///
+    /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but
+    /// possibly read while not holding it.
     inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
 
     /// Do we want to delete locally and remotely this when `LayerInner` is dropped
     wanted_deleted: AtomicBool,
 
-    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
-    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
-    /// [`LayerInner::on_downloaded_layer_drop`].
-    wanted_evicted: AtomicBool,
-
-    /// Version is to make sure we will only evict a specific download of a file.
+    /// Version is to make sure we will only evict a specific initialization of the downloaded file.
     ///
-    /// Incremented for each download, stored in `DownloadedLayer::version` or
+    /// Incremented for each initialization, stored in `DownloadedLayer::version` or
     /// `ResidentOrWantedEvicted::WantedEvicted`.
     version: AtomicUsize,
 
-    /// Allow subscribing to when the layer actually gets evicted.
-    status: tokio::sync::broadcast::Sender<Status>,
+    /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download
+    /// starts, or completes.
+    ///
+    /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard.
+    /// Holding the InitPermit is the only time we can do state transitions, but we also need to
+    /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to
+    /// [`ResidentOrWantedEvicted::Resident`] on access.
+    ///
+    /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`].
+    status: Option<tokio::sync::watch::Sender<Status>>,
 
-    /// Counter for exponential backoff with the download
+    /// Counter for exponential backoff with the download.
+    ///
+    /// This is atomic only for the purposes of having additional data only accessed while holding
+    /// the InitPermit.
     consecutive_failures: AtomicUsize,
 
     /// The generation of this Layer.
@@ -491,7 +574,13 @@ struct LayerInner {
     /// a shard split since the layer was originally written.
     shard: ShardIndex,
 
+    /// When the Layer was last evicted but has not been downloaded since.
+    ///
+    /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
     last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
+
+    #[cfg(test)]
+    failpoints: std::sync::Mutex<Vec<failpoints::Failpoint>>,
 }
 
 impl std::fmt::Display for LayerInner {
@@ -508,16 +597,16 @@ impl AsLayerDesc for LayerInner {
 
 #[derive(Debug, Clone, Copy)]
 enum Status {
+    Resident,
     Evicted,
-    Downloaded,
+    Downloading,
 }
 
 impl Drop for LayerInner {
     fn drop(&mut self) {
         if !*self.wanted_deleted.get_mut() {
-            // should we try to evict if the last wish was for eviction?
-            // feels like there's some hazard of overcrowding near shutdown near by, but we don't
-            // run drops during shutdown (yet)
+            // should we try to evict if the last wish was for eviction? seems more like a hazard
+            // than a clear win.
             return;
         }
 
@@ -528,9 +617,9 @@ impl Drop for LayerInner {
         let file_size = self.layer_desc().file_size;
         let timeline = self.timeline.clone();
         let meta = self.metadata();
-        let status = self.status.clone();
+        let status = self.status.take();
 
-        crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
+        Self::spawn_blocking(move || {
             let _g = span.entered();
 
             // carry this until we are finished for [`Layer::wait_drop`] support
@@ -605,12 +694,16 @@ impl LayerInner {
             .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
             .join(desc.filename().to_string());
 
-        let (inner, version) = if let Some(inner) = downloaded {
+        let (inner, version, init_status) = if let Some(inner) = downloaded {
             let version = inner.version;
             let resident = ResidentOrWantedEvicted::Resident(inner);
-            (heavier_once_cell::OnceCell::new(resident), version)
+            (
+                heavier_once_cell::OnceCell::new(resident),
+                version,
+                Status::Resident,
+            )
         } else {
-            (heavier_once_cell::OnceCell::default(), 0)
+            (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
         };
 
         LayerInner {
@@ -621,14 +714,15 @@ impl LayerInner {
             have_remote_client: timeline.remote_client.is_some(),
             access_stats,
             wanted_deleted: AtomicBool::new(false),
-            wanted_evicted: AtomicBool::new(false),
             inner,
             version: AtomicUsize::new(version),
-            status: tokio::sync::broadcast::channel(1).0,
+            status: Some(tokio::sync::watch::channel(init_status).0),
             consecutive_failures: AtomicUsize::new(0),
             generation,
             shard,
             last_evicted_at: std::sync::Mutex::default(),
+            #[cfg(test)]
+            failpoints: Default::default(),
         }
     }
 
@@ -644,20 +738,34 @@ impl LayerInner {
 
     /// Cancellation safe, however dropping the future and calling this method again might result
     /// in a new attempt to evict OR join the previously started attempt.
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
     pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
-        use tokio::sync::broadcast::error::RecvError;
-
         assert!(self.have_remote_client);
 
-        let mut rx = self.status.subscribe();
+        let mut rx = self.status.as_ref().unwrap().subscribe();
+
+        {
+            let current = rx.borrow_and_update();
+            match &*current {
+                Status::Resident => {
+                    // we might get lucky and evict this; continue
+                }
+                Status::Evicted | Status::Downloading => {
+                    // it is already evicted
+                    return Err(EvictionError::NotFound);
+                }
+            }
+        }
 
         let strong = {
             match self.inner.get() {
-                Some(mut either) => {
-                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    either.downgrade()
+                Some(mut either) => either.downgrade(),
+                None => {
+                    // we already have a scheduled eviction, which just has not gotten to run yet.
+                    // it might still race with a read access, but that could also get cancelled,
+                    // so let's say this is not evictable.
+                    return Err(EvictionError::NotFound);
                 }
-                None => return Err(EvictionError::NotFound),
             }
         };
 
@@ -673,26 +781,26 @@ impl LayerInner {
             LAYER_IMPL_METRICS.inc_started_evictions();
         }
 
-        match tokio::time::timeout(timeout, rx.recv()).await {
-            Ok(Ok(Status::Evicted)) => Ok(()),
-            Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
-            Ok(Err(RecvError::Closed)) => {
-                unreachable!("sender cannot be dropped while we are in &self method")
-            }
-            Ok(Err(RecvError::Lagged(_))) => {
-                // this is quite unlikely, but we are blocking a lot in the async context, so
-                // we might be missing this because we are stuck on a LIFO slot on a thread
-                // which is busy blocking for a 1TB database create_image_layers.
-                //
-                // use however late (compared to the initial expressing of wanted) as the
-                // "outcome" now
-                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get() {
-                    Some(_) => Err(EvictionError::Downloaded),
-                    None => Ok(()),
-                }
-            }
-            Err(_timeout) => Err(EvictionError::Timeout),
+        let changed = rx.changed();
+        let changed = tokio::time::timeout(timeout, changed).await;
+
+        let Ok(changed) = changed else {
+            return Err(EvictionError::Timeout);
+        };
+
+        let _: () = changed.expect("cannot be closed, because we are holding a strong reference");
+
+        let current = rx.borrow_and_update();
+
+        match &*current {
+            // the easiest case
+            Status::Evicted => Ok(()),
+            // it surely was evicted in between, but then there was a new access now; we can't know
+            // if it'll succeed so lets just call it evicted
+            Status::Downloading => Ok(()),
+            // either the download which was started after eviction completed already, or it was
+            // never evicted
+            Status::Resident => Err(EvictionError::Downloaded),
         }
     }
 
@@ -703,32 +811,37 @@ impl LayerInner {
         ctx: Option<&RequestContext>,
     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
         let (weak, permit) = {
+            // get_or_init_detached can:
+            // - be fast (mutex lock) OR uncontested semaphore permit acquire
+            // - be slow (wait for semaphore permit or closing)
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
             let locked = self
                 .inner
                 .get_or_init_detached()
                 .await
                 .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
 
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
+
             match locked {
                 // this path could had been a RwLock::read
                 Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
                 Ok(Ok((strong, _))) => {
                     // when upgraded back, the Arc<DownloadedLayer> is still available, but
-                    // previously a `evict_and_wait` was received.
-                    self.wanted_evicted.store(false, Ordering::Relaxed);
-
-                    // error out any `evict_and_wait`
-                    drop(self.status.send(Status::Downloaded));
+                    // previously a `evict_and_wait` was received. this is the only place when we
+                    // send out an update without holding the InitPermit.
+                    //
+                    // note that we also have dropped the Guard; this is fine, because we just made
+                    // a state change and are holding a strong reference to be returned.
+                    self.status.as_ref().unwrap().send_replace(Status::Resident);
                     LAYER_IMPL_METRICS
                         .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
 
                     return Ok(strong);
                 }
-                Ok(Err(mut guard)) => {
-                    // path to here: the evict_blocking is stuck on spawn_blocking queue.
-                    //
-                    // reset the contents, deactivating the eviction and causing a
-                    // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
+                Ok(Err(guard)) => {
+                    // path to here: we won the eviction, the file should still be on the disk.
                     let (weak, permit) = guard.take_and_deinit();
                     (Some(weak), permit)
                 }
@@ -744,86 +857,66 @@ impl LayerInner {
             );
         }
 
+        let timeline = self
+            .timeline
+            .upgrade()
+            .ok_or_else(|| DownloadError::TimelineShutdown)?;
+
+        // count cancellations, which currently remain largely unexpected
+        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
+        // check if we really need to be downloaded: this can happen if a read access won the
+        // semaphore before eviction.
+        //
+        // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a
+        // pending eviction will try to evict even upon finding an uninitialized `self.inner`.
+        let needs_download = self
+            .needs_download()
+            .await
+            .map_err(DownloadError::PreStatFailed);
+
+        scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+        let needs_download = needs_download?;
+
+        let Some(reason) = needs_download else {
+            // the file is present locally because eviction has not had a chance to run yet
+
+            #[cfg(test)]
+            self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
+                .await?;
+
+            LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+            return Ok(self.initialize_after_layer_is_on_disk(permit));
+        };
+
+        // we must download; getting cancelled before spawning the download is not an issue as
+        // any still running eviction would not find anything to evict.
+
+        if let NeedsDownload::NotFile(ft) = reason {
+            return Err(DownloadError::NotFile(ft));
+        }
+
+        if timeline.remote_client.as_ref().is_none() {
+            return Err(DownloadError::NoRemoteStorage);
+        }
+
+        if let Some(ctx) = ctx {
+            self.check_expected_download(ctx)?;
+        }
+
+        if !allow_download {
+            // this is only used from tests, but it is hard to test without the boolean
+            return Err(DownloadError::DownloadRequired);
+        }
+
         async move {
-            // disable any scheduled but not yet running eviction deletions for this
-            let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
-
-            // no need to make the evict_and_wait wait for the actual download to complete
-            drop(self.status.send(Status::Downloaded));
-
-            let timeline = self
-                .timeline
-                .upgrade()
-                .ok_or_else(|| DownloadError::TimelineShutdown)?;
-
-            // count cancellations, which currently remain largely unexpected
-            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
-            // check if we really need to be downloaded; could have been already downloaded by a
-            // cancelled previous attempt.
-            let needs_download = self
-                .needs_download()
-                .await
-                .map_err(DownloadError::PreStatFailed);
-
-            let needs_download = match needs_download {
-                Ok(reason) => reason,
-                Err(e) => {
-                    scopeguard::ScopeGuard::into_inner(init_cancelled);
-                    return Err(e);
-                }
-            };
-
-            let Some(reason) = needs_download else {
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                // the file is present locally, probably by a previous but cancelled call to
-                // get_or_maybe_download. alternatively we might be running without remote storage.
-                LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                let res = self.initialize_after_layer_is_on_disk(next_version, permit, false);
-                return Ok(res);
-            };
-
-            if let NeedsDownload::NotFile(ft) = reason {
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                return Err(DownloadError::NotFile(ft));
-            }
-
-            // only reset this after we've decided we really need to download. otherwise it'd
-            // be impossible to mark cancelled downloads for eviction, like one could imagine
-            // we would like to do for prefetching which was not needed.
-            self.wanted_evicted.store(false, Ordering::Release);
-
-            if timeline.remote_client.as_ref().is_none() {
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                return Err(DownloadError::NoRemoteStorage);
-            }
-
-            if let Some(ctx) = ctx {
-                let res = self.check_expected_download(ctx);
-                if let Err(e) = res {
-                    scopeguard::ScopeGuard::into_inner(init_cancelled);
-                    return Err(e);
-                }
-            }
-
-            if !allow_download {
-                // this does look weird, but for LayerInner the "downloading" means also changing
-                // internal once related state ...
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                return Err(DownloadError::DownloadRequired);
-            }
-
             tracing::info!(%reason, "downloading on-demand");
 
-            let permit = self.spawn_download_and_wait(timeline, permit).await;
-
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+            let res = self.download_init_and_wait(timeline, permit).await?;
             scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-            let permit = permit?;
-
-            let res = self.initialize_after_layer_is_on_disk(next_version, permit, true);
             Ok(res)
         }
         .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
@@ -857,11 +950,11 @@ impl LayerInner {
     }
 
     /// Actual download, at most one is executed at the time.
-    async fn spawn_download_and_wait(
+    async fn download_init_and_wait(
         self: &Arc<Self>,
         timeline: Arc<Timeline>,
         permit: heavier_once_cell::InitPermit,
-    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
+    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         let (tx, rx) = tokio::sync::oneshot::channel();
@@ -873,66 +966,35 @@ impl LayerInner {
             .enter()
             .map_err(|_| DownloadError::DownloadCancelled)?;
 
-        tokio::task::spawn(async move {
-
+        Self::spawn(
+            async move {
                 let _guard = guard;
 
-                let client = timeline
-                    .remote_client
+                // now that we have commited to downloading, send out an update to:
+                // - unhang any pending eviction
+                // - break out of evict_and_wait
+                this.status
                     .as_ref()
-                    .expect("checked above with have_remote_client");
+                    .unwrap()
+                    .send_replace(Status::Downloading);
 
-                let result = client.download_layer_file(
-                    &this.desc.filename(),
-                    &this.metadata(),
-                    &timeline.cancel
-                )
-                .await;
+                #[cfg(test)]
+                this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading)
+                    .await
+                    .unwrap();
 
-                let result = match result {
-                    Ok(size) => {
-                        timeline.metrics.resident_physical_size_add(size);
-                        Ok(())
-                    }
-                    Err(e) => {
-                        let consecutive_failures =
-                            this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+                let res = this.download_and_init(timeline, permit).await;
 
-                        let backoff = utils::backoff::exponential_backoff_duration_seconds(
-                            consecutive_failures.min(u32::MAX as usize) as u32,
-                            1.5,
-                            60.0,
-                        );
-
-                        let backoff = std::time::Duration::from_secs_f64(backoff);
-
-                        tokio::select! {
-                            _ = tokio::time::sleep(backoff) => {},
-                            _ = timeline.cancel.cancelled() => {},
-                        };
-
-                        Err(e)
-                    }
-                };
-
-                if let Err(res) = tx.send((result, permit)) {
+                if let Err(res) = tx.send(res) {
                     match res {
-                        (Ok(()), _) => {
-                            // our caller is cancellation safe so this is fine; if someone
-                            // else requests the layer, they'll find it already downloaded.
-                            //
-                            // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
-                            //
-                            // FIXME(#6028): however, could be that we should consider marking the
-                            // layer for eviction? alas, cannot: because only DownloadedLayer will
-                            // handle that.
-                        },
-                        (Err(e), _) => {
-                            // our caller is cancellation safe, but we might be racing with
-                            // another attempt to initialize. before we have cancellation
-                            // token support: these attempts should converge regardless of
-                            // their completion order.
-                            tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}");
+                        Ok(_res) => {
+                            tracing::debug!("layer initialized, but caller has been cancelled");
+                            LAYER_IMPL_METRICS.inc_init_completed_without_requester();
+                        }
+                        Err(e) => {
+                            tracing::info!(
+                                "layer file download failed, and caller has been cancelled: {e:?}"
+                            );
                             LAYER_IMPL_METRICS.inc_download_failed_without_requester();
                         }
                     }
@@ -942,41 +1004,102 @@ impl LayerInner {
         );
 
         match rx.await {
-            Ok((Ok(()), permit)) => {
-                if let Some(reason) = self
-                    .needs_download()
-                    .await
-                    .map_err(DownloadError::PostStatFailed)?
-                {
-                    // this is really a bug in needs_download or remote timeline client
-                    panic!("post-condition failed: needs_download returned {reason:?}");
-                }
-
-                self.consecutive_failures.store(0, Ordering::Relaxed);
-                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
-
-                Ok(permit)
-            }
-            Ok((Err(e), _permit)) => {
+            Ok(Ok(res)) => Ok(res),
+            Ok(Err(e)) => {
                 // sleep already happened in the spawned task, if it was not cancelled
-                let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
-
                 match e.downcast_ref::<remote_storage::DownloadError>() {
                     // If the download failed due to its cancellation token,
                     // propagate the cancellation error upstream.
                     Some(remote_storage::DownloadError::Cancelled) => {
                         Err(DownloadError::DownloadCancelled)
                     }
-                    _ => {
-                        tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
-                        Err(DownloadError::DownloadFailed)
-                    }
+                    // FIXME: this is not embedding the error because historically it would had
+                    // been output to compute, however that is no longer the case.
+                    _ => Err(DownloadError::DownloadFailed),
                 }
             }
             Err(_gone) => Err(DownloadError::DownloadCancelled),
         }
     }
 
+    async fn download_and_init(
+        self: &Arc<LayerInner>,
+        timeline: Arc<Timeline>,
+        permit: heavier_once_cell::InitPermit,
+    ) -> anyhow::Result<Arc<DownloadedLayer>> {
+        let client = timeline
+            .remote_client
+            .as_ref()
+            .expect("checked before download_init_and_wait");
+
+        let result = client
+            .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel)
+            .await;
+
+        match result {
+            Ok(size) => {
+                assert_eq!(size, self.desc.file_size);
+
+                match self.needs_download().await {
+                    Ok(Some(reason)) => {
+                        // this is really a bug in needs_download or remote timeline client
+                        panic!("post-condition failed: needs_download returned {reason:?}");
+                    }
+                    Ok(None) => {
+                        // as expected
+                    }
+                    Err(e) => {
+                        panic!("post-condition failed: needs_download errored: {e:?}");
+                    }
+                }
+
+                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
+                timeline
+                    .metrics
+                    .resident_physical_size_add(self.desc.file_size);
+                self.consecutive_failures.store(0, Ordering::Relaxed);
+
+                let since_last_eviction = self
+                    .last_evicted_at
+                    .lock()
+                    .unwrap()
+                    .take()
+                    .map(|ts| ts.elapsed());
+                if let Some(since_last_eviction) = since_last_eviction {
+                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                }
+
+                self.access_stats.record_residence_event(
+                    LayerResidenceStatus::Resident,
+                    LayerResidenceEventReason::ResidenceChange,
+                );
+
+                Ok(self.initialize_after_layer_is_on_disk(permit))
+            }
+            Err(e) => {
+                let consecutive_failures =
+                    1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+
+                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+
+                let backoff = utils::backoff::exponential_backoff_duration_seconds(
+                    consecutive_failures.min(u32::MAX as usize) as u32,
+                    1.5,
+                    60.0,
+                );
+
+                let backoff = std::time::Duration::from_secs_f64(backoff);
+
+                tokio::select! {
+                    _ = tokio::time::sleep(backoff) => {},
+                    _ = timeline.cancel.cancelled() => {},
+                };
+
+                Err(e)
+            }
+        }
+    }
+
     /// Initializes the `Self::inner` to a "resident" state.
     ///
     /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
@@ -986,25 +1109,13 @@ impl LayerInner {
     /// changes are made before we can write to the OnceCell in non-cancellable fashion.
     fn initialize_after_layer_is_on_disk(
         self: &Arc<LayerInner>,
-        next_version: usize,
         permit: heavier_once_cell::InitPermit,
-        downloaded: bool,
     ) -> Arc<DownloadedLayer> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        if downloaded {
-            let since_last_eviction = self
-                .last_evicted_at
-                .lock()
-                .unwrap()
-                .take()
-                .map(|ts| ts.elapsed());
-            if let Some(since_last_eviction) = since_last_eviction {
-                // FIXME: this will not always be recorded correctly until #6028 (the no
-                // download needed branch above)
-                LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-            }
-        }
+        // disable any scheduled but not yet running eviction deletions for this initialization
+        let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+        self.status.as_ref().unwrap().send_replace(Status::Resident);
 
         let res = Arc::new(DownloadedLayer {
             owner: Arc::downgrade(self),
@@ -1012,15 +1123,9 @@ impl LayerInner {
             version: next_version,
         });
 
-        // FIXME: this might now be double-accounted for !downloaded
-        self.access_stats.record_residence_event(
-            LayerResidenceStatus::Resident,
-            LayerResidenceEventReason::ResidenceChange,
-        );
-
         let waiters = self.inner.initializer_count();
         if waiters > 0 {
-            tracing::info!(waiters, "completing the on-demand download for other tasks");
+            tracing::info!(waiters, "completing layer init for other tasks");
         }
 
         let value = ResidentOrWantedEvicted::Resident(res.clone());
@@ -1063,9 +1168,11 @@ impl LayerInner {
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.desc.filename().file_name();
 
-        // this is not accurate: we could have the file locally but there was a cancellation
-        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get().is_none();
+        let resident = self
+            .inner
+            .get()
+            .map(|rowe| rowe.is_likely_resident())
+            .unwrap_or(false);
 
         let access_stats = self.access_stats.as_api_model(reset);
 
@@ -1077,7 +1184,7 @@ impl LayerInner {
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn_range.start,
                 lsn_end: lsn_range.end,
-                remote,
+                remote: !resident,
                 access_stats,
             }
         } else {
@@ -1087,94 +1194,195 @@ impl LayerInner {
                 layer_file_name,
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn,
-                remote,
+                remote: !resident,
                 access_stats,
             }
         }
     }
 
     /// `DownloadedLayer` is being dropped, so it calls this method.
-    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let evict = self.wanted_evicted.load(Ordering::Acquire);
+    fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
         let can_evict = self.have_remote_client;
 
-        if can_evict && evict {
-            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
+        // we cannot know without inspecting LayerInner::inner if we should evict or not, even
+        // though here it is very likely
+        let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
 
-            // downgrade for queueing, in case there's a tear down already ongoing we should not
-            // hold it alive.
-            let this = Arc::downgrade(&self);
-            drop(self);
-
-            // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
-            // drop while the `self.inner` is being locked, leading to a deadlock.
-
-            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
-                let _g = span.entered();
-
-                // if LayerInner is already dropped here, do nothing because the delete on drop
-                // has already ran while we were in queue
-                let Some(this) = this.upgrade() else {
-                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
-                    return;
-                };
-                match this.evict_blocking(version) {
-                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
-                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
-                }
+        if !can_evict {
+            // it would be nice to assert this case out, but we are in drop
+            span.in_scope(|| {
+                tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
             });
+            return;
         }
+
+        // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
+        // drop while the `self.inner` is being locked, leading to a deadlock.
+
+        let start_evicting = async move {
+            #[cfg(test)]
+            self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting)
+                .await
+                .expect("failpoint should not have errored");
+
+            tracing::debug!("eviction started");
+
+            let res = self.wait_for_turn_and_evict(only_version).await;
+            // metrics: ignore the Ok branch, it is not done yet
+            if let Err(e) = res {
+                tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
+                LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
+            }
+        };
+
+        Self::spawn(start_evicting.instrument(span));
     }
 
-    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
-        // deleted or detached timeline, don't do anything.
-        let Some(timeline) = self.timeline.upgrade() else {
-            return Err(EvictionCancelled::TimelineGone);
-        };
+    async fn wait_for_turn_and_evict(
+        self: Arc<LayerInner>,
+        only_version: usize,
+    ) -> Result<(), EvictionCancelled> {
+        fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> {
+            use Status::*;
+            match status {
+                Resident => Ok(()),
+                Evicted => Err(EvictionCancelled::UnexpectedEvictedState),
+                Downloading => Err(EvictionCancelled::LostToDownload),
+            }
+        }
+
+        let timeline = self
+            .timeline
+            .upgrade()
+            .ok_or(EvictionCancelled::TimelineGone)?;
+
+        let mut rx = self
+            .status
+            .as_ref()
+            .expect("LayerInner cannot be dropped, holding strong ref")
+            .subscribe();
+
+        is_good_to_continue(&rx.borrow_and_update())?;
 
         let Ok(_gate) = timeline.gate.enter() else {
             return Err(EvictionCancelled::TimelineGone);
         };
 
-        // to avoid starting a new download while we evict, keep holding on to the
-        // permit.
-        let _permit = {
-            let maybe_downloaded = self.inner.get();
+        let permit = {
+            // we cannot just `std::fs::remove_file` because there might already be an
+            // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem
+            // operations must be done while holding the heavier_once_cell::InitPermit
+            let mut wait = std::pin::pin!(self.inner.get_or_init_detached());
 
-            let (_weak, permit) = match maybe_downloaded {
-                Some(mut guard) => {
-                    if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
-                        if *version == only_version {
-                            guard.take_and_deinit()
-                        } else {
-                            // this was not for us; maybe there's another eviction job
-                            // TODO: does it make any sense to stall here? unique versions do not
-                            // matter, we only want to make sure not to evict a resident, which we
-                            // are not doing.
-                            return Err(EvictionCancelled::VersionCheckFailed);
-                        }
-                    } else {
-                        return Err(EvictionCancelled::AlreadyReinitialized);
+            let waited = loop {
+                // we must race to the Downloading starting, otherwise we would have to wait until the
+                // completion of the download. waiting for download could be long and hinder our
+                // efforts to alert on "hanging" evictions.
+                tokio::select! {
+                    res = &mut wait => break res,
+                    _ = rx.changed() => {
+                        is_good_to_continue(&rx.borrow_and_update())?;
+                        // two possibilities for Status::Resident:
+                        // - the layer was found locally from disk by a read
+                        // - we missed a bunch of updates and now the layer is
+                        // again downloaded -- assume we'll fail later on with
+                        // version check or AlreadyReinitialized
                     }
                 }
-                None => {
-                    // already deinitialized, perhaps get_or_maybe_download did this and is
-                    // currently waiting to reinitialize it
-                    return Err(EvictionCancelled::LostToDownload);
+            };
+
+            // re-check now that we have the guard or permit; all updates should have happened
+            // while holding the permit.
+            is_good_to_continue(&rx.borrow_and_update())?;
+
+            // the term deinitialize is used here, because we clearing out the Weak will eventually
+            // lead to deallocating the reference counted value, and the value we
+            // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned.
+            let (_weak, permit) = match waited {
+                Ok(guard) => {
+                    match &*guard {
+                        ResidentOrWantedEvicted::WantedEvicted(_weak, version)
+                            if *version == only_version =>
+                        {
+                            tracing::debug!(version, "deinitializing matching WantedEvicted");
+                            let (weak, permit) = guard.take_and_deinit();
+                            (Some(weak), permit)
+                        }
+                        ResidentOrWantedEvicted::WantedEvicted(_, version) => {
+                            // if we were not doing the version check, we would need to try to
+                            // upgrade the weak here to see if it really is dropped. version check
+                            // is done instead assuming that it is cheaper.
+                            tracing::debug!(
+                                version,
+                                only_version,
+                                "version mismatch, not deinitializing"
+                            );
+                            return Err(EvictionCancelled::VersionCheckFailed);
+                        }
+                        ResidentOrWantedEvicted::Resident(_) => {
+                            return Err(EvictionCancelled::AlreadyReinitialized);
+                        }
+                    }
+                }
+                Err(permit) => {
+                    tracing::debug!("continuing after cancelled get_or_maybe_download or eviction");
+                    (None, permit)
                 }
             };
 
             permit
         };
 
-        // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
+        let span = tracing::Span::current();
 
-        self.access_stats.record_residence_event(
-            LayerResidenceStatus::Evicted,
-            LayerResidenceEventReason::ResidenceChange,
-        );
+        let spawned_at = std::time::Instant::now();
 
-        let res = match capture_mtime_and_remove(&self.path) {
+        // this is on purpose a detached spawn; we don't need to wait for it
+        //
+        // eviction completion reporting is the only thing hinging on this, and it can be just as
+        // well from a spawn_blocking thread.
+        //
+        // important to note that now that we've acquired the permit we have made sure the evicted
+        // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case
+        // there are multiple evictions. The rest is not cancellable, and we've now commited to
+        // evicting.
+        //
+        // If spawn_blocking has a queue and maximum number of threads are in use, we could stall
+        // reads. We will need to add cancellation for that if necessary.
+        Self::spawn_blocking(move || {
+            let _span = span.entered();
+
+            let res = self.evict_blocking(&timeline, &permit);
+
+            let waiters = self.inner.initializer_count();
+
+            if waiters > 0 {
+                LAYER_IMPL_METRICS.inc_evicted_with_waiters();
+            }
+
+            let completed_in = spawned_at.elapsed();
+            LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
+
+            match res {
+                Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
+                Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
+            }
+
+            tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
+        });
+
+        Ok(())
+    }
+
+    /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
+    fn evict_blocking(
+        &self,
+        timeline: &Timeline,
+        _permit: &heavier_once_cell::InitPermit,
+    ) -> Result<(), EvictionCancelled> {
+        // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
+
+        match capture_mtime_and_remove(&self.path) {
             Ok(local_layer_mtime) => {
                 let duration = SystemTime::now().duration_since(local_layer_mtime);
                 match duration {
@@ -1198,33 +1406,60 @@ impl LayerInner {
                 timeline
                     .metrics
                     .resident_physical_size_sub(self.desc.file_size);
-
-                Ok(())
             }
             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                 tracing::error!(
                     layer_size = %self.desc.file_size,
-                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
+                    "failed to evict layer from disk, it was already gone"
                 );
-                Err(EvictionCancelled::FileNotFound)
+                return Err(EvictionCancelled::FileNotFound);
             }
             Err(e) => {
+                // FIXME: this should probably be an abort
                 tracing::error!("failed to evict file from disk: {e:#}");
-                Err(EvictionCancelled::RemoveFailed)
+                return Err(EvictionCancelled::RemoveFailed);
             }
-        };
+        }
 
-        // we are still holding the permit, so no new spawn_download_and_wait can happen
-        drop(self.status.send(Status::Evicted));
+        self.access_stats.record_residence_event(
+            LayerResidenceStatus::Evicted,
+            LayerResidenceEventReason::ResidenceChange,
+        );
+
+        self.status.as_ref().unwrap().send_replace(Status::Evicted);
 
         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
 
-        res
+        Ok(())
     }
 
     fn metadata(&self) -> LayerFileMetadata {
         LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
     }
+
+    /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
+    ///
+    /// Synchronizing with spawned tasks is very complicated otherwise.
+    fn spawn<F>(fut: F)
+    where
+        F: std::future::Future<Output = ()> + Send + 'static,
+    {
+        #[cfg(test)]
+        tokio::task::spawn(fut);
+        #[cfg(not(test))]
+        crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut);
+    }
+
+    /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME.
+    fn spawn_blocking<F>(f: F)
+    where
+        F: FnOnce() + Send + 'static,
+    {
+        #[cfg(test)]
+        tokio::task::spawn_blocking(f);
+        #[cfg(not(test))]
+        crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f);
+    }
 }
 
 fn capture_mtime_and_remove(path: &Utf8Path) -> Result<SystemTime, std::io::Error> {
@@ -1268,8 +1503,10 @@ pub(crate) enum DownloadError {
     DownloadCancelled,
     #[error("pre-condition: stat before download failed")]
     PreStatFailed(#[source] std::io::Error),
-    #[error("post-condition: stat after download failed")]
-    PostStatFailed(#[source] std::io::Error),
+
+    #[cfg(test)]
+    #[error("failpoint: {0:?}")]
+    Failpoint(failpoints::FailpointKind),
 }
 
 #[derive(Debug, PartialEq)]
@@ -1316,6 +1553,7 @@ impl Drop for DownloadedLayer {
             owner.on_downloaded_layer_drop(self.version);
         } else {
             // no need to do anything, we are shutting down
+            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
         }
     }
 }
@@ -1556,6 +1794,7 @@ pub(crate) struct LayerImplMetrics {
     rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
     inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
     redownload_after: metrics::Histogram,
+    time_to_evict: metrics::Histogram,
 }
 
 impl Default for LayerImplMetrics {
@@ -1651,6 +1890,13 @@ impl Default for LayerImplMetrics {
             .unwrap()
         };
 
+        let time_to_evict = metrics::register_histogram!(
+            "pageserver_layer_eviction_held_permit_seconds",
+            "Time eviction held the permit.",
+            vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
+        )
+        .unwrap();
+
         Self {
             started_evictions,
             completed_evictions,
@@ -1663,6 +1909,7 @@ impl Default for LayerImplMetrics {
             rare_counters,
             inits_cancelled,
             redownload_after,
+            time_to_evict,
         }
     }
 }
@@ -1694,6 +1941,12 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }
 
+    /// Expected rare just as cancellations are rare, but we could have cancellations separate from
+    /// the single caller which can start the download, so use this counter to separte them.
+    fn inc_init_completed_without_requester(&self) {
+        self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
+    }
+
     /// Expected rare because cancellations are unexpected, and failures are unexpected
     fn inc_download_failed_without_requester(&self) {
         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
@@ -1718,10 +1971,6 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
     }
 
-    fn inc_broadcast_lagged(&self) {
-        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
-    }
-
     fn inc_init_cancelled(&self) {
         self.inits_cancelled.inc()
     }
@@ -1729,9 +1978,22 @@ impl LayerImplMetrics {
     fn record_redownloaded_after(&self, duration: std::time::Duration) {
         self.redownload_after.observe(duration.as_secs_f64())
     }
+
+    /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
+    /// instead cancel eviction if we would have read waiters. We cannot however separate reads
+    /// from other evictions, so this could have noise as well.
+    fn inc_evicted_with_waiters(&self) {
+        self.rare_counters[RareEvent::EvictedWithWaiters].inc();
+    }
+
+    /// Recorded at least initially as the permit is now acquired in async context before
+    /// spawn_blocking action.
+    fn record_time_to_evict(&self, duration: std::time::Duration) {
+        self.time_to_evict.observe(duration.as_secs_f64())
+    }
 }
 
-#[derive(enum_map::Enum)]
+#[derive(Debug, Clone, Copy, enum_map::Enum)]
 enum EvictionCancelled {
     LayerGone,
     TimelineGone,
@@ -1743,6 +2005,7 @@ enum EvictionCancelled {
     LostToDownload,
     /// After eviction, there was a new layer access which cancelled the eviction.
     UpgradedBackOnAccess,
+    UnexpectedEvictedState,
 }
 
 impl EvictionCancelled {
@@ -1756,6 +2019,7 @@ impl EvictionCancelled {
             EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
             EvictionCancelled::LostToDownload => "lost_to_download",
             EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
+            EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
         }
     }
 }
@@ -1778,11 +2042,12 @@ impl DeleteFailed {
 #[derive(enum_map::Enum)]
 enum RareEvent {
     RemoveOnDropFailed,
+    InitCompletedWithoutRequester,
     DownloadFailedWithoutRequester,
     UpgradedWantedEvicted,
     InitWithoutDownload,
     PermanentLoadingFailure,
-    EvictAndWaitLagged,
+    EvictedWithWaiters,
 }
 
 impl RareEvent {
@@ -1791,11 +2056,12 @@ impl RareEvent {
 
         match self {
             RemoveOnDropFailed => "remove_on_drop_failed",
+            InitCompletedWithoutRequester => "init_completed_without",
             DownloadFailedWithoutRequester => "download_failed_without",
             UpgradedWantedEvicted => "raced_wanted_evicted",
             InitWithoutDownload => "init_needed_no_download",
             PermanentLoadingFailure => "permanent_loading_failure",
-            EvictAndWaitLagged => "broadcast_lagged",
+            EvictedWithWaiters => "evicted_with_waiters",
         }
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer/failpoints.rs b/pageserver/src/tenant/storage_layer/layer/failpoints.rs
new file mode 100644
index 0000000000..6cedc41d98
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs
@@ -0,0 +1,119 @@
+//! failpoints for unit tests, implying `#[cfg(test)]`.
+//!
+//! These are not accessible over http.
+
+use super::*;
+
+impl Layer {
+    /// Enable a failpoint from a unit test.
+    pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
+        self.0.failpoints.lock().unwrap().push(failpoint);
+    }
+}
+
+impl LayerInner {
+    /// Query if this failpoint is enabled, as in, arrive at a failpoint.
+    ///
+    /// Calls to this method need to be `#[cfg(test)]` guarded.
+    pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
+        let fut = {
+            let mut fps = self.failpoints.lock().unwrap();
+            // find the *last* failpoint for cases in which we need to use multiple for the same
+            // thing (two blocked evictions)
+            let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
+
+            let Some(fp) = fp else {
+                return Ok(());
+            };
+
+            fp.hit()
+        };
+
+        fut.await
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) enum FailpointKind {
+    /// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
+    AfterDeterminingLayerNeedsNoDownload,
+    /// Failpoint for stalling eviction starting
+    WaitBeforeStartingEvicting,
+    /// Failpoint hit in the spawned task
+    WaitBeforeDownloading,
+}
+
+pub(crate) enum Failpoint {
+    AfterDeterminingLayerNeedsNoDownload,
+    WaitBeforeStartingEvicting(
+        Option<utils::completion::Completion>,
+        utils::completion::Barrier,
+    ),
+    WaitBeforeDownloading(
+        Option<utils::completion::Completion>,
+        utils::completion::Barrier,
+    ),
+}
+
+impl Failpoint {
+    fn kind(&self) -> FailpointKind {
+        match self {
+            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
+                FailpointKind::AfterDeterminingLayerNeedsNoDownload
+            }
+            Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
+            Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
+        }
+    }
+
+    fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
+        use futures::future::FutureExt;
+
+        // use boxed futures to avoid Either hurdles
+        match self {
+            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
+                let kind = self.kind();
+
+                async move { Err(FailpointHit(kind)) }.boxed()
+            }
+            Failpoint::WaitBeforeStartingEvicting(arrival, b)
+            | Failpoint::WaitBeforeDownloading(arrival, b) => {
+                // first one signals arrival
+                drop(arrival.take());
+
+                let b = b.clone();
+
+                async move {
+                    tracing::trace!("waiting on a failpoint barrier");
+                    b.wait().await;
+                    tracing::trace!("done waiting on a failpoint barrier");
+                    Ok(())
+                }
+                .boxed()
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for FailpointKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(self, f)
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct FailpointHit(FailpointKind);
+
+impl std::fmt::Display for FailpointHit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(self, f)
+    }
+}
+
+impl std::error::Error for FailpointHit {}
+
+impl From<FailpointHit> for DownloadError {
+    fn from(value: FailpointHit) -> Self {
+        DownloadError::Failpoint(value.0)
+    }
+}
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index e7b2eb025a..247ff123b5 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,14 +1,13 @@
-use futures::StreamExt;
 use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
-use tracing::Instrument;
 use utils::{
     completion::{self, Completion},
     id::TimelineId,
 };
 
+use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
-use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
+use crate::context::DownloadBehavior;
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
 
 /// Used in tests to advance a future to wanted await point, and not futher.
@@ -21,7 +20,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
 /// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
 #[tokio::test]
 async fn smoke_test() {
-    let handle = BACKGROUND_RUNTIME.handle();
+    let handle = tokio::runtime::Handle::current();
 
     let h = TenantHarness::create("smoke_test").unwrap();
     let span = h.span();
@@ -38,7 +37,7 @@ async fn smoke_test() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -88,7 +87,7 @@ async fn smoke_test() {
     //
     // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
     // artificially slow it down.
-    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
 
     match layer
         .evict_and_wait(std::time::Duration::ZERO)
@@ -99,7 +98,7 @@ async fn smoke_test() {
             // expected, but note that the eviction is "still ongoing"
             helper.release().await;
             // exhaust spawn_blocking pool to ensure it is now complete
-            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
+            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
                 .await;
         }
         other => unreachable!("{other:?}"),
@@ -108,7 +107,7 @@ async fn smoke_test() {
     // only way to query if a layer is resident is to acquire a ResidentLayer instance.
     // Layer::keep_resident never downloads, but it might initialize if the layer file is found
     // downloaded locally.
-    let none = layer.keep_resident().await.unwrap();
+    let none = layer.keep_resident().await;
     assert!(
         none.is_none(),
         "Expected none, because eviction removed the local file, found: {none:?}"
@@ -167,6 +166,7 @@ async fn smoke_test() {
     rtc.wait_completion().await.unwrap();
 
     assert_eq!(rtc.get_remote_physical_size(), 0);
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }
 
 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -174,7 +174,7 @@ async fn smoke_test() {
 #[tokio::test(start_paused = true)]
 async fn evict_and_wait_on_wanted_deleted() {
     // this is the runtime on which Layer spawns the blocking tasks on
-    let handle = BACKGROUND_RUNTIME.handle();
+    let handle = tokio::runtime::Handle::current();
 
     let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
     utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
         drop(resident);
 
         // make sure the eviction task gets to run
-        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
 
         let resident = layer.keep_resident().await;
         assert!(
-            matches!(resident, Ok(None)),
+            resident.is_none(),
             "keep_resident should not have re-initialized: {resident:?}"
         );
 
@@ -235,24 +235,408 @@ async fn evict_and_wait_on_wanted_deleted() {
         layers.finish_gc_timeline(&[layer]);
     }
 
-    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
 
     assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
     assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
     assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
     assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }
 
-/// This test shows that ensures we are able to read the layer while the layer eviction has been
-/// started but not completed due to spawn_blocking pool being blocked.
-///
-/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
-#[tokio::test(start_paused = true)]
-async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
-    // this is the runtime on which Layer spawns the blocking tasks on
-    let handle = BACKGROUND_RUNTIME.handle();
-    let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
+/// This test ensures we are able to read the layer while the layer eviction has been
+/// started but not completed.
+#[test]
+fn read_wins_pending_eviction() {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .max_blocking_threads(1)
+        .enable_all()
+        .start_paused(true)
+        .build()
         .unwrap();
+
+    rt.block_on(async move {
+        // this is the runtime on which Layer spawns the blocking tasks on
+        let handle = tokio::runtime::Handle::current();
+        let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
+        let (tenant, ctx) = h.load().await;
+        let span = h.span();
+        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .await
+            .unwrap();
+
+        let layer = {
+            let mut layers = {
+                let layers = timeline.layers.read().await;
+                layers.likely_resident_layers().collect::<Vec<_>>()
+            };
+
+            assert_eq!(layers.len(), 1);
+
+            layers.swap_remove(0)
+        };
+
+        // setup done
+
+        let resident = layer.keep_resident().await.unwrap();
+
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+        // drive the future to await on the status channel
+        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect_err("should had been a timeout since we are holding the layer resident");
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        let (completion, barrier) = utils::completion::channel();
+        let (arrival, arrived_at_barrier) = utils::completion::channel();
+        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+            Some(arrival),
+            barrier,
+        ));
+
+        // now the eviction cannot proceed because the threads are consumed while completion exists
+        drop(resident);
+        arrived_at_barrier.wait().await;
+        assert!(!layer.is_likely_resident());
+
+        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+        layer
+            .0
+            .get_or_maybe_download(false, None)
+            .instrument(download_span)
+            .await
+            .expect("should had reinitialized without downloading");
+
+        assert!(layer.is_likely_resident());
+
+        // reinitialization notifies of new resident status, which should error out all evict_and_wait
+        let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect("no timeout, because get_or_maybe_download re-initialized")
+            .expect_err("eviction should not have succeeded because re-initialized");
+
+        // works as intended: evictions lose to "downloads"
+        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        drop(completion);
+
+        tokio::time::sleep(ADVANCE).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
+            .await;
+
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // now we finally can observe the original eviction failing
+        // it would had been possible to observe it earlier, but here it is guaranteed to have
+        // happened.
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+    });
+}
+
+/// Use failpoint to delay an eviction starting to get a VersionCheckFailed.
+#[test]
+fn multiple_pending_evictions_in_order() {
+    let name = "multiple_pending_evictions_in_order";
+    let in_order = true;
+    multiple_pending_evictions_scenario(name, in_order);
+}
+
+/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState.
+#[test]
+fn multiple_pending_evictions_out_of_order() {
+    let name = "multiple_pending_evictions_out_of_order";
+    let in_order = false;
+    multiple_pending_evictions_scenario(name, in_order);
+}
+
+fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .max_blocking_threads(1)
+        .enable_all()
+        .start_paused(true)
+        .build()
+        .unwrap();
+
+    rt.block_on(async move {
+        // this is the runtime on which Layer spawns the blocking tasks on
+        let handle = tokio::runtime::Handle::current();
+        let h = TenantHarness::create(name).unwrap();
+        let (tenant, ctx) = h.load().await;
+        let span = h.span();
+        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
+
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .await
+            .unwrap();
+
+        let layer = {
+            let mut layers = {
+                let layers = timeline.layers.read().await;
+                layers.likely_resident_layers().collect::<Vec<_>>()
+            };
+
+            assert_eq!(layers.len(), 1);
+
+            layers.swap_remove(0)
+        };
+
+        // setup done
+
+        let resident = layer.keep_resident().await.unwrap();
+
+        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+        // drive the future to await on the status channel
+        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect_err("should had been a timeout since we are holding the layer resident");
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        let (completion1, barrier) = utils::completion::channel();
+        let mut completion1 = Some(completion1);
+        let (arrival, arrived_at_barrier) = utils::completion::channel();
+        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+            Some(arrival),
+            barrier,
+        ));
+
+        // now the eviction cannot proceed because we are simulating arbitrary long delay for the
+        // eviction task start.
+        drop(resident);
+        assert!(!layer.is_likely_resident());
+
+        arrived_at_barrier.wait().await;
+
+        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+        layer
+            .0
+            .get_or_maybe_download(false, None)
+            .instrument(download_span)
+            .await
+            .expect("should had reinitialized without downloading");
+
+        assert!(layer.is_likely_resident());
+
+        // reinitialization notifies of new resident status, which should error out all evict_and_wait
+        let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+            .await
+            .expect("no timeout, because get_or_maybe_download re-initialized")
+            .expect_err("eviction should not have succeeded because re-initialized");
+
+        // works as intended: evictions lose to "downloads"
+        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // configure another failpoint for the second eviction -- evictions are per initialization,
+        // so now that we've reinitialized the inner, we get to run two of them at the same time.
+        let (completion2, barrier) = utils::completion::channel();
+        let (arrival, arrived_at_barrier) = utils::completion::channel();
+        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+            Some(arrival),
+            barrier,
+        ));
+
+        let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+        // advance to the wait on the queue
+        tokio::time::timeout(ADVANCE, &mut second_eviction)
+            .await
+            .expect_err("timeout because failpoint is blocking");
+
+        arrived_at_barrier.wait().await;
+
+        assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+
+        let mut release_earlier_eviction = |expected_reason| {
+            assert_eq!(
+                0,
+                LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+            );
+
+            drop(completion1.take().unwrap());
+
+            let handle = &handle;
+
+            async move {
+                tokio::time::sleep(ADVANCE).await;
+                SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
+                    handle, 1,
+                )
+                .await;
+
+                assert_eq!(
+                    1,
+                    LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+                );
+            }
+        };
+
+        if in_order {
+            release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
+        }
+
+        // release the later eviction which is for the current version
+        drop(completion2);
+        tokio::time::sleep(ADVANCE).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
+            .await;
+
+        if !in_order {
+            release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
+        }
+
+        tokio::time::timeout(ADVANCE, &mut second_eviction)
+            .await
+            .expect("eviction goes through now that spawn_blocking is unclogged")
+            .expect("eviction should succeed, because version matches");
+
+        assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // ensure the cancelled are unchanged
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+    });
+}
+
+/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
+/// a `Layer::keep_resident` call.
+///
+/// This matters because cancelling the eviction would leave us in a state where the file is on
+/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to
+/// have non-repairing `Layer::is_likely_resident`.
+#[tokio::test(start_paused = true)]
+async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
+    let handle = tokio::runtime::Handle::current();
+    let h =
+        TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    let layer = {
+        let mut layers = {
+            let layers = timeline.layers.read().await;
+            layers.likely_resident_layers().collect::<Vec<_>>()
+        };
+
+        assert_eq!(layers.len(), 1);
+
+        layers.swap_remove(0)
+    };
+
+    // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
+    // Err) at the right time as in "during" the `LayerInner::needs_download`.
+    layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
+
+    let (completion, barrier) = utils::completion::channel();
+    let (arrival, arrived_at_barrier) = utils::completion::channel();
+
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));
+
+    tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
+        .await
+        .expect_err("should had advanced to waiting on channel");
+
+    arrived_at_barrier.wait().await;
+
+    // simulate a cancelled read which is cancelled before it gets to re-initialize
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(
+        matches!(
+            e,
+            DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
+        ),
+        "{e:?}"
+    );
+
+    assert!(
+        layer.0.needs_download().await.unwrap().is_none(),
+        "file is still on disk"
+    );
+
+    // release the eviction task
+    drop(completion);
+    tokio::time::sleep(ADVANCE).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+    // failpoint is still enabled, but it is not hit
+    let e = layer
+        .0
+        .get_or_maybe_download(false, None)
+        .await
+        .unwrap_err();
+    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
+
+    // failpoint is not counted as cancellation either
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+}
+
+#[tokio::test(start_paused = true)]
+async fn evict_and_wait_does_not_wait_for_download() {
+    // let handle = tokio::runtime::Handle::current();
+    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
     let (tenant, ctx) = h.load().await;
     let span = h.span();
     let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -265,7 +649,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
     let layer = {
         let mut layers = {
             let layers = timeline.layers.read().await;
-            layers.resident_layers().collect::<Vec<_>>().await
+            layers.likely_resident_layers().collect::<Vec<_>>()
         };
 
         assert_eq!(layers.len(), 1);
@@ -273,91 +657,76 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
         layers.swap_remove(0)
     };
 
-    // setup done
-
-    let resident = layer.keep_resident().await.unwrap();
+    // kind of forced setup: start an eviction but do not allow it progress until we are
+    // downloading
+    let (eviction_can_continue, barrier) = utils::completion::channel();
+    let (arrival, eviction_arrived) = utils::completion::channel();
+    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
+        Some(arrival),
+        barrier,
+    ));
 
     let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
 
-    // drive the future to await on the status channel
+    // use this once-awaited other_evict to synchronize with the eviction
+    let other_evict = layer.evict_and_wait(FOREVER);
+
     tokio::time::timeout(ADVANCE, &mut evict_and_wait)
         .await
-        .expect_err("should had been a timeout since we are holding the layer resident");
-    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+        .expect_err("should had advanced");
+    eviction_arrived.wait().await;
+    drop(eviction_can_continue);
+    other_evict.await.unwrap();
 
-    // clog up BACKGROUND_RUNTIME spawn_blocking
-    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+    // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
+    assert!(!layer.is_likely_resident());
 
-    // now the eviction cannot proceed because the threads are consumed while completion exists
-    drop(resident);
+    // following new evict_and_wait will fail until we've completed the download
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound), "{e:?}");
 
-    // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
-    layer
-        .keep_resident()
-        .instrument(download_span)
-        .await
-        .expect("keep_resident should had reinitialized without downloading")
-        .expect("ResidentLayer");
+    let (download_can_continue, barrier) = utils::completion::channel();
+    let (arrival, _download_arrived) = utils::completion::channel();
+    layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));
 
-    // because the keep_resident check alters wanted evicted without sending a message, we will
-    // never get completed
-    let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
-        .await
-        .expect("no timeout, because keep_resident re-initialized")
-        .expect_err("eviction should not have succeeded because re-initialized");
+    let mut download = std::pin::pin!(layer
+        .0
+        .get_or_maybe_download(true, None)
+        .instrument(download_span));
 
-    // works as intended: evictions lose to "downloads"
-    assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-    assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-    // this is not wrong: the eviction is technically still "on the way" as it's still queued
-    // because spawn_blocking is clogged up
-    assert_eq!(
-        0,
-        LAYER_IMPL_METRICS
-            .cancelled_evictions
-            .values()
-            .map(|ctr| ctr.get())
-            .sum::<u64>()
+    assert!(
+        !layer.is_likely_resident(),
+        "during download layer is evicted"
     );
 
-    let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
-
-    // advance to the wait on the queue
-    tokio::time::timeout(ADVANCE, &mut second_eviction)
+    tokio::time::timeout(ADVANCE, &mut download)
         .await
-        .expect_err("timeout because spawn_blocking is clogged");
+        .expect_err("should had timed out because of failpoint");
 
-    // in this case we don't leak started evictions, but I think there is still a chance of that
-    // happening, because we could have upgrades race multiple evictions while only one of them
-    // happens?
-    assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+    // now we finally get to continue, and because the latest state is downloading, we deduce that
+    // original eviction succeeded
+    evict_and_wait.await.unwrap();
 
-    helper.release().await;
+    // however a new evict_and_wait will fail
+    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
+    assert!(matches!(e, EvictionError::NotFound), "{e:?}");
 
-    // the second_eviction gets to run here
-    //
-    // synchronize to be *strictly* after the second_eviction spawn_blocking run
-    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
+    assert!(!layer.is_likely_resident());
 
-    tokio::time::timeout(ADVANCE, &mut second_eviction)
-        .await
-        .expect("eviction goes through now that spawn_blocking is unclogged")
-        .expect("eviction should succeed, because version matches");
+    drop(download_can_continue);
+    download.await.expect("download should had succeeded");
+    assert!(layer.is_likely_resident());
 
-    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+    // only now can we evict
+    layer.evict_and_wait(FOREVER).await.unwrap();
+}
 
-    // now we finally can observe the original spawn_blocking failing
-    // it would had been possible to observe it earlier, but here it is guaranteed to have
-    // happened.
-    assert_eq!(
-        1,
-        LAYER_IMPL_METRICS
-            .cancelled_evictions
-            .values()
-            .map(|ctr| ctr.get())
-            .sum::<u64>()
-    );
+#[test]
+fn layer_size() {
+    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
+    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
+    // it also has the utf8 path
 }
 
 struct SpawnBlockingPoolHelper {
@@ -374,31 +743,41 @@ impl SpawnBlockingPoolHelper {
     ///
     /// This should be no issue nowdays, because nextest runs each test in it's own process.
     async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
-        let (completion, barrier) = completion::channel();
-        let (tx, mut rx) = tokio::sync::mpsc::channel(8);
+        let default_max_blocking_threads = 512;
 
-        let assumed_max_blocking_threads = 512;
+        Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
+    }
+
+    async fn consume_all_spawn_blocking_threads0(
+        handle: &tokio::runtime::Handle,
+        threads: usize,
+    ) -> Self {
+        assert_ne!(threads, 0);
+
+        let (completion, barrier) = completion::channel();
+        let (started, starts_completed) = completion::channel();
 
         let mut blocking_tasks = JoinSet::new();
 
-        for _ in 0..assumed_max_blocking_threads {
+        for _ in 0..threads {
             let barrier = barrier.clone();
-            let tx = tx.clone();
+            let started = started.clone();
             blocking_tasks.spawn_blocking_on(
                 move || {
-                    tx.blocking_send(()).unwrap();
-                    drop(tx);
+                    drop(started);
                     tokio::runtime::Handle::current().block_on(barrier.wait());
                 },
                 handle,
             );
         }
 
+        drop(started);
+
+        starts_completed.wait().await;
+
         drop(barrier);
 
-        for _ in 0..assumed_max_blocking_threads {
-            rx.recv().await.unwrap();
-        }
+        tracing::trace!("consumed all threads");
 
         SpawnBlockingPoolHelper {
             awaited_by_spawn_blocking_tasks: completion,
@@ -418,13 +797,22 @@ impl SpawnBlockingPoolHelper {
         while let Some(res) = blocking_tasks.join_next().await {
             res.expect("none of the tasks should had panicked");
         }
+
+        tracing::trace!("released all threads");
     }
 
     /// In the tests it is used as an easy way of making sure something scheduled on the target
     /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
     /// before our tasks have a chance to schedule and complete.
     async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
-        Self::consume_all_spawn_blocking_threads(handle)
+        Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
+    }
+
+    async fn consume_and_release_all_of_spawn_blocking_threads0(
+        handle: &tokio::runtime::Handle,
+        threads: usize,
+    ) {
+        Self::consume_all_spawn_blocking_threads0(handle, threads)
             .await
             .release()
             .await
@@ -438,7 +826,7 @@ fn spawn_blocking_pool_helper_actually_works() {
     // because the amount is not configurable for our helper, expect the same amount as
     // BACKGROUND_RUNTIME using the tokio defaults would have.
     let rt = tokio::runtime::Builder::new_current_thread()
-        .max_blocking_threads(512)
+        .max_blocking_threads(1)
         .enable_all()
         .build()
         .unwrap();
@@ -448,7 +836,8 @@ fn spawn_blocking_pool_helper_actually_works() {
     rt.block_on(async move {
         // this will not return until all threads are spun up and actually executing the code
         // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
-        let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
+        let consumed =
+            SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;
 
         println!("consumed");
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2ab7301cce..7523130f23 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -13,7 +13,6 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
-use futures::stream::StreamExt;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::AUX_FILES_KEY,
@@ -37,6 +36,7 @@ use tracing::*;
 use utils::{
     bin_ser::BeSer,
     sync::gate::{Gate, GateGuard},
+    vec_map::VecMap,
 };
 
 use std::ops::{Deref, Range};
@@ -2442,7 +2442,7 @@ impl Timeline {
 
         let guard = self.layers.read().await;
 
-        let resident = guard.resident_layers().map(|layer| {
+        let resident = guard.likely_resident_layers().map(|layer| {
             let last_activity_ts = layer.access_stats().latest_activity_or_now();
 
             HeatMapLayer::new(
@@ -2452,7 +2452,7 @@ impl Timeline {
             )
         });
 
-        let layers = resident.collect().await;
+        let layers = resident.collect();
 
         Some(HeatMapTimeline::new(self.timeline_id, layers))
     }
@@ -4302,7 +4302,7 @@ impl Timeline {
         let mut max_layer_size: Option<u64> = None;
 
         let resident_layers = guard
-            .resident_layers()
+            .likely_resident_layers()
             .map(|layer| {
                 let file_size = layer.layer_desc().file_size;
                 max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
@@ -4315,8 +4315,7 @@ impl Timeline {
                     relative_last_activity: finite_f32::FiniteF32::ZERO,
                 }
             })
-            .collect()
-            .await;
+            .collect();
 
         DiskUsageEvictionInfo {
             max_layer_size,
@@ -4618,16 +4617,15 @@ impl<'a> TimelineWriter<'a> {
         }
     }
 
-    /// Put a batch keys at the specified Lsns.
+    /// Put a batch of keys at the specified Lsns.
     ///
-    /// The batch should be sorted by Lsn such that it's safe
-    /// to roll the open layer mid batch.
+    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
     pub(crate) async fn put_batch(
         &mut self,
-        batch: Vec<(Key, Lsn, Value)>,
+        batch: VecMap<Lsn, (Key, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        for (key, lsn, val) in batch {
+        for (lsn, (key, val)) in batch {
             self.put(key, lsn, &val, ctx).await?
         }
 
@@ -4713,7 +4711,6 @@ mod tests {
             .keep_resident()
             .await
             .expect("no download => no downloading errors")
-            .expect("should had been resident")
             .drop_eviction_guard();
 
         let forever = std::time::Duration::from_secs(120);
@@ -4724,7 +4721,7 @@ mod tests {
         let (first, second) = tokio::join!(first, second);
 
         let res = layer.keep_resident().await;
-        assert!(matches!(res, Ok(None)), "{res:?}");
+        assert!(res.is_none(), "{res:?}");
 
         match (first, second) {
             (Ok(()), Ok(())) => {
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index dd603135d2..dd769d4121 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -225,24 +225,18 @@ impl Timeline {
         {
             let guard = self.layers.read().await;
             let layers = guard.layer_map();
-            for hist_layer in layers.iter_historic_layers() {
-                let hist_layer = guard.get_from_desc(&hist_layer);
+            for layer in layers.iter_historic_layers() {
+                let layer = guard.get_from_desc(&layer);
 
                 // guard against eviction while we inspect it; it might be that eviction_task and
                 // disk_usage_eviction_task both select the same layers to be evicted, and
                 // seemingly free up double the space. both succeeding is of no consequence.
-                let guard = match hist_layer.keep_resident().await {
-                    Ok(Some(l)) => l,
-                    Ok(None) => continue,
-                    Err(e) => {
-                        // these should not happen, but we cannot make them statically impossible right
-                        // now.
-                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
-                        continue;
-                    }
-                };
 
-                let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
+                if !layer.is_likely_resident() {
+                    continue;
+                }
+
+                let last_activity_ts = layer.access_stats().latest_activity_or_now();
 
                 let no_activity_for = match now.duration_since(last_activity_ts) {
                     Ok(d) => d,
@@ -265,9 +259,8 @@ impl Timeline {
                         continue;
                     }
                 };
-                let layer = guard.drop_eviction_guard();
+
                 if no_activity_for > p.threshold {
-                    // this could cause a lot of allocations in some cases
                     js.spawn(async move {
                         layer
                             .evict_and_wait(std::time::Duration::from_secs(5))
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index ebcdcfdb4d..d54dc1642c 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,5 +1,4 @@
 use anyhow::{bail, ensure, Context, Result};
-use futures::StreamExt;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -241,29 +240,16 @@ impl LayerManager {
         layer.delete_on_drop();
     }
 
-    pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
+    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
         // for small layer maps, we most likely have all resident, but for larger more are likely
         // to be evicted assuming lots of layers correlated with longer lifespan.
 
-        let layers = self
-            .layer_map()
-            .iter_historic_layers()
-            .map(|desc| self.get_from_desc(&desc));
-
-        let layers = futures::stream::iter(layers);
-
-        layers.filter_map(|layer| async move {
-            // TODO(#6028): this query does not really need to see the ResidentLayer
-            match layer.keep_resident().await {
-                Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
-                Ok(None) => None,
-                Err(e) => {
-                    // these should not happen, but we cannot make them statically impossible right
-                    // now.
-                    tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
-                    None
-                }
-            }
+        self.layer_map().iter_historic_layers().filter_map(|desc| {
+            self.layer_fmgr
+                .0
+                .get(&desc.key())
+                .filter(|l| l.is_likely_resident())
+                .cloned()
         })
     }
 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index c46fd9b3ec..002bf4e2ce 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -549,9 +549,10 @@ walprop_pg_init_standalone_sync_safekeepers(void)
 static void
 walprop_sigusr2(SIGNAL_ARGS)
 {
+	int			save_errno = errno;
 	got_SIGUSR2 = true;
-
 	SetLatch(MyLatch);
+	errno = save_errno;
 }
 
 static void
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index b3a5bf873e..93a1fe85db 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -63,6 +63,7 @@ sha2 = { workspace = true, features = ["asm"] }
 smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
+subtle.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index f26dcb7c9a..45bbad8cb2 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -194,14 +194,7 @@ pub(crate) async fn validate_password_and_exchange(
         }
         // perform scram authentication as both client and server to validate the keys
         AuthSecret::Scram(scram_secret) => {
-            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
-            let outcome = crate::scram::exchange(
-                &scram_secret,
-                sasl_client,
-                crate::config::TlsServerEndPoint::Undefined,
-            )
-            .await?;
+            let outcome = crate::scram::exchange(&scram_secret, password).await?;
 
             let client_key = match outcome {
                 sasl::Outcome::Success(client_key) => client_key,
diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs
index 6186ddde0d..cbb17ebcb7 100644
--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -2,14 +2,21 @@ use anyhow::{anyhow, bail};
 use hyper::{Body, Request, Response, StatusCode};
 use std::{convert::Infallible, net::TcpListener};
 use tracing::info;
-use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
+use utils::http::{
+    endpoint::{self, prometheus_metrics_handler, request_span},
+    error::ApiError,
+    json::json_response,
+    RouterBuilder, RouterService,
+};
 
 async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(StatusCode::OK, "")
 }
 
 fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
-    endpoint::make_router().get("/v1/status", status_handler)
+    endpoint::make_router()
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/v1/status", status_handler)
 }
 
 pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 5d0340e852..9c3be73612 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -135,9 +135,10 @@ impl TestAuth for NoAuth {}
 struct Scram(scram::ServerSecret);
 
 impl Scram {
-    fn new(password: &str) -> anyhow::Result<Self> {
-        let secret =
-            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
+    async fn new(password: &str) -> anyhow::Result<Self> {
+        let secret = scram::ServerSecret::build(password)
+            .await
+            .context("failed to generate scram secret")?;
         Ok(Scram(secret))
     }
 
@@ -284,7 +285,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
     let proxy = tokio::spawn(dummy_proxy(
         client,
         Some(server_config),
-        Scram::new(password)?,
+        Scram::new(password).await?,
     ));
 
     let (_client, _conn) = tokio_postgres::Config::new()
@@ -308,7 +309,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
     let proxy = tokio::spawn(dummy_proxy(
         client,
         Some(server_config),
-        Scram::new("password")?,
+        Scram::new("password").await?,
     ));
 
     let (_client, _conn) = tokio_postgres::Config::new()
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index e0c2d836f4..3b760e5dab 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
     let proxy = tokio::spawn(dummy_proxy(
         client,
         Some(server_config),
-        Scram::new("password")?,
+        Scram::new("password").await?,
     ));
 
     let _client_err = tokio_postgres::Config::new()
@@ -231,7 +231,7 @@ async fn connect_failure(
     let proxy = tokio::spawn(dummy_proxy(
         client,
         Some(server_config),
-        Scram::new("password")?,
+        Scram::new("password").await?,
     ));
 
     let _client_err = tokio_postgres::Config::new()
diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index 1cf8b53e11..0811416ca2 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -33,6 +33,9 @@ pub enum Error {
     #[error("Internal error: missing digest")]
     MissingBinding,
 
+    #[error("could not decode salt: {0}")]
+    Base64(#[from] base64::DecodeError),
+
     #[error(transparent)]
     Io(#[from] io::Error),
 }
@@ -55,6 +58,7 @@ impl ReportableError for Error {
             Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
             Error::BadClientMessage(_) => crate::error::ErrorKind::User,
             Error::MissingBinding => crate::error::ErrorKind::Service,
+            Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
             Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
         }
     }
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index df4b3ec8d7..ed80675f8a 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -56,8 +56,6 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
 
 #[cfg(test)]
 mod tests {
-    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-
     use crate::sasl::{Mechanism, Step};
 
     use super::{Exchange, ServerSecret};
@@ -114,17 +112,10 @@ mod tests {
     }
 
     async fn run_round_trip_test(server_password: &str, client_password: &str) {
-        let scram_secret = ServerSecret::build(server_password).unwrap();
-        let sasl_client =
-            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
-
-        let outcome = super::exchange(
-            &scram_secret,
-            sasl_client,
-            crate::config::TlsServerEndPoint::Undefined,
-        )
-        .await
-        .unwrap();
+        let scram_secret = ServerSecret::build(server_password).await.unwrap();
+        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
+            .await
+            .unwrap();
 
         match outcome {
             crate::sasl::Outcome::Success(_) => {}
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 16575d5d98..682cbe795f 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -2,7 +2,11 @@
 
 use std::convert::Infallible;
 
-use postgres_protocol::authentication::sasl::ScramSha256;
+use hmac::{Hmac, Mac};
+use sha2::digest::FixedOutput;
+use sha2::{Digest, Sha256};
+use subtle::{Choice, ConstantTimeEq};
+use tokio::task::yield_now;
 
 use super::messages::{
     ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
@@ -71,47 +75,71 @@ impl<'a> Exchange<'a> {
     }
 }
 
+// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
+async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
+    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+    let mut prev = hmac
+        .clone()
+        .chain_update(salt)
+        .chain_update(1u32.to_be_bytes())
+        .finalize()
+        .into_bytes();
+
+    let mut hi = prev;
+
+    for i in 1..iterations {
+        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
+
+        for (hi, prev) in hi.iter_mut().zip(prev) {
+            *hi ^= prev;
+        }
+        // yield every ~250us
+        // hopefully reduces tail latencies
+        if i % 1024 == 0 {
+            yield_now().await
+        }
+    }
+
+    hi.into()
+}
+
+// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
+async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32], [u8; 32]) {
+    let salted_password = pbkdf2(password, salt, iterations).await;
+
+    let make_key = |name| {
+        let key = Hmac::<Sha256>::new_from_slice(&salted_password)
+            .expect("HMAC is able to accept all key sizes")
+            .chain_update(name)
+            .finalize();
+
+        <[u8; 32]>::from(key.into_bytes())
+    };
+
+    (make_key(b"Client Key"), make_key(b"Server Key"))
+}
+
 pub async fn exchange(
     secret: &ServerSecret,
-    mut client: ScramSha256,
-    tls_server_end_point: config::TlsServerEndPoint,
+    password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
-    use sasl::Step::*;
+    let salt = base64::decode(&secret.salt_base64)?;
+    let (client_key, server_key) = derive_keys(password, &salt, secret.iterations).await;
+    let stored_key: [u8; 32] = Sha256::default()
+        .chain_update(client_key)
+        .finalize_fixed()
+        .into();
 
-    let init = SaslInitial {
-        nonce: rand::random,
-    };
+    // constant time to not leak partial key match
+    let valid = stored_key.ct_eq(&secret.stored_key.as_bytes())
+        | server_key.ct_eq(&secret.server_key.as_bytes())
+        | Choice::from(secret.doomed as u8);
 
-    let client_first = std::str::from_utf8(client.message())
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
-        Continue(sent, server_first) => {
-            // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread.
-            // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl
-            client = tokio::task::spawn_blocking(move || {
-                client.update(server_first.as_bytes())?;
-                Ok::<ScramSha256, std::io::Error>(client)
-            })
-            .await
-            .expect("should not panic while performing password hash")?;
-            sent
-        }
-        Success(x, _) => match x {},
-        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
-    };
-
-    let client_final = std::str::from_utf8(client.message())
-        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
-        Success(keys, server_final) => {
-            client.finish(server_final.as_bytes())?;
-            keys
-        }
-        Continue(x, _) => match x {},
-        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
-    };
-
-    Ok(sasl::Outcome::Success(keys))
+    if valid.into() {
+        Ok(sasl::Outcome::Success(super::ScramKey::from(client_key)))
+    } else {
+        Ok(sasl::Outcome::Failure("password doesn't match"))
+    }
 }
 
 impl SaslInitial {
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index fb3c45816e..b46d8c3ab5 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -59,10 +59,8 @@ impl ServerSecret {
     /// Build a new server secret from the prerequisites.
     /// XXX: We only use this function in tests.
     #[cfg(test)]
-    pub fn build(password: &str) -> Option<Self> {
-        Self::parse(&postgres_protocol::password::scram_sha_256(
-            password.as_bytes(),
-        ))
+    pub async fn build(password: &str) -> Option<Self> {
+        Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await)
     }
 }
 
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a0c0c7ca4c..9ce26e6c5d 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -20,7 +20,7 @@ use std::io::Write as _;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
 use tracing::{info_span, Instrument};
-use utils::http::endpoint::{request_span, ChannelWriter};
+use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter};
 
 use crate::debug_dump::TimelineDigestRequest;
 use crate::receive_wal::WalReceiverState;
@@ -515,6 +515,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
     router
         .data(Arc::new(conf))
         .data(auth)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .get("/v1/status", |r| request_span(r, status_handler))
         .put("/v1/failpoints", |r| {
             request_span(r, move |r| async {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1d30c45278..f8994a8dcc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2150,6 +2150,18 @@ class NeonStorageController(MetricsGetter):
         shards: list[dict[str, Any]] = body["shards"]
         return shards
 
+    def tenant_describe(self, tenant_id: TenantId):
+        """
+        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
+        """
+        response = self.request(
+            "GET",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
     def tenant_shard_split(
         self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
     ) -> list[TenantShardId]:
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 99ec894106..6aebfbc99c 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -626,6 +626,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res_json = res.json()
         return res_json
 
+    def timeline_layer_map_info(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ):
+        log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}")
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        return res_json
+
     def timeline_checkpoint(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py
index 48dd84fb06..be56203b26 100644
--- a/test_runner/performance/test_gc_feedback.py
+++ b/test_runner/performance/test_gc_feedback.py
@@ -1,3 +1,5 @@
+import json
+
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
@@ -79,3 +81,8 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
     zenbenchmark.record(
         "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER
     )
+
+    layer_map_path = env.repo_dir / "layer-map.json"
+    log.info(f"Writing layer map to {layer_map_path}")
+    with layer_map_path.open("w") as f:
+        f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index cb58c640c3..e6318aff68 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,5 +1,6 @@
 import os
 import time
+from collections import defaultdict
 from typing import Dict, List, Optional, Union
 
 import pytest
@@ -13,7 +14,7 @@ from fixtures.neon_fixtures import (
     tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
-from fixtures.types import Lsn, TenantShardId, TimelineId
+from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 from pytest_httpserver import HTTPServer
@@ -159,11 +160,20 @@ def test_sharding_split_smoke(
 
     neon_env_builder.preserve_database_files = True
 
-    env = neon_env_builder.init_start(
-        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
+
+    env = neon_env_builder.init_configs(True)
+    neon_env_builder.start()
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id,
+        timeline_id,
+        shard_count=shard_count,
+        shard_stripe_size=stripe_size,
+        placement_policy='{"Attached": 1}',
+        conf=non_default_tenant_config,
     )
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
     workload = Workload(env, tenant_id, timeline_id, branch_name="main")
     workload.init()
 
@@ -223,6 +233,14 @@ def test_sharding_split_smoke(
     # Before split, old shards exist
     assert shards_on_disk(old_shard_ids)
 
+    # Before split, we have done one reconcile for each shard
+    assert (
+        env.storage_controller.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+        )
+        == shard_count
+    )
+
     env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
 
     post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
@@ -268,39 +286,62 @@ def test_sharding_split_smoke(
 
     workload.validate()
 
-    # Check that we didn't do any spurious reconciliations.
-    # Total number of reconciles should have been one per original shard, plus
-    # one for each shard that was migrated.
+    # Assert on how many reconciles happened during the process.  This is something of an
+    # implementation detail, but it is useful to detect any bugs that might generate spurious
+    # extra reconcile iterations.
+    #
+    # We'll have:
+    # - shard_count reconciles for the original setup of the tenant
+    # - shard_count reconciles for detaching the original secondary locations during split
+    # - split_shard_count reconciles during shard splitting, for setting up secondaries.
+    # - shard_count reconciles for the migrations we did to move child shards away from their split location
+    expect_reconciles = shard_count * 2 + split_shard_count + shard_count
     reconcile_ok = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
-    assert reconcile_ok == shard_count + split_shard_count // 2
+    assert reconcile_ok == expect_reconciles
 
     # Check that no cancelled or errored reconciliations occurred: this test does no
     # failure injection and should run clean.
-    assert (
-        env.storage_controller.get_metric_value(
-            "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
-        )
-        is None
+    cancelled_reconciles = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
     )
-    assert (
-        env.storage_controller.get_metric_value(
-            "storage_controller_reconcile_complete_total", filter={"status": "error"}
-        )
-        is None
+    errored_reconciles = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "error"}
     )
+    assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0
+    assert errored_reconciles is not None and int(errored_reconciles) == 0
 
     env.storage_controller.consistency_check()
 
-    # Validate pageserver state
-    shards_exist: list[TenantShardId] = []
-    for pageserver in env.pageservers:
-        locations = pageserver.http_client().tenant_list_locations()
-        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
+    def get_node_shard_counts(env: NeonEnv, tenant_ids):
+        total: defaultdict[int, int] = defaultdict(int)
+        attached: defaultdict[int, int] = defaultdict(int)
+        for tid in tenant_ids:
+            for shard in env.storage_controller.tenant_describe(tid)["shards"]:
+                log.info(
+                    f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} "
+                )
+                for node in shard["node_secondary"]:
+                    total[int(node)] += 1
+                attached[int(shard["node_attached"])] += 1
+                total[int(shard["node_attached"])] += 1
 
-    log.info(f"Shards after split: {shards_exist}")
-    assert len(shards_exist) == split_shard_count
+        return total, attached
+
+    def check_effective_tenant_config():
+        # Expect our custom tenant configs to have survived the split
+        for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]:
+            node = env.get_pageserver(int(shard["node_attached"]))
+            config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"]))
+            for k, v in non_default_tenant_config.items():
+                assert config.effective_config[k] == v
+
+    # Validate pageserver state: expect every child shard to have an attached and secondary location
+    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
+    assert sum(attached.values()) == split_shard_count
+    assert sum(total.values()) == split_shard_count * 2
+    check_effective_tenant_config()
 
     # Ensure post-split pageserver locations survive a restart (i.e. the child shards
     # correctly wrote config to disk, and the storage controller responds correctly
@@ -309,13 +350,11 @@ def test_sharding_split_smoke(
         pageserver.stop()
         pageserver.start()
 
-    shards_exist = []
-    for pageserver in env.pageservers:
-        locations = pageserver.http_client().tenant_list_locations()
-        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
-
-    log.info("Shards after restart: {shards_exist}")
-    assert len(shards_exist) == split_shard_count
+    # Validate pageserver state: expect every child shard to have an attached and secondary location
+    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
+    assert sum(attached.values()) == split_shard_count
+    assert sum(total.values()) == split_shard_count * 2
+    check_effective_tenant_config()
 
     workload.validate()
 
@@ -721,9 +760,16 @@ def test_sharding_split_failures(
     initial_shard_count = 2
     split_shard_count = 4
 
-    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    # Create a tenant with secondary locations enabled
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
+    )
 
     env.storage_controller.allowed_errors.extend(
         [
@@ -736,6 +782,8 @@ def test_sharding_split_failures(
             ".*failpoint.*",
             # Node offline cases will fail to send requests
             ".*Reconcile error: receive body: error sending request for url.*",
+            # Node offline cases will fail inside reconciler when detaching secondaries
+            ".*Reconcile error on shard.*: receive body: error sending request for url.*",
         ]
     )
 
@@ -773,7 +821,8 @@ def test_sharding_split_failures(
     # will have succeeded: the net result should be to return to a clean state, including
     # detaching any child shards.
     def assert_rolled_back(exclude_ps_id=None) -> None:
-        count = 0
+        secondary_count = 0
+        attached_count = 0
         for ps in env.pageservers:
             if exclude_ps_id is not None and ps.id == exclude_ps_id:
                 continue
@@ -781,13 +830,25 @@ def test_sharding_split_failures(
             locations = ps.http_client().tenant_list_locations()["tenant_shards"]
             for loc in locations:
                 tenant_shard_id = TenantShardId.parse(loc[0])
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                 assert tenant_shard_id.shard_count == initial_shard_count
-                count += 1
-        assert count == initial_shard_count
+                if loc[1]["mode"] == "Secondary":
+                    secondary_count += 1
+                else:
+                    attached_count += 1
+
+        if exclude_ps_id is not None:
+            # For a node failure case, we expect there to be a secondary location
+            # scheduled on the offline node, so expect one fewer secondary in total
+            assert secondary_count == initial_shard_count - 1
+        else:
+            assert secondary_count == initial_shard_count
+
+        assert attached_count == initial_shard_count
 
     def assert_split_done(exclude_ps_id=None) -> None:
-        count = 0
+        secondary_count = 0
+        attached_count = 0
         for ps in env.pageservers:
             if exclude_ps_id is not None and ps.id == exclude_ps_id:
                 continue
@@ -795,10 +856,14 @@ def test_sharding_split_failures(
             locations = ps.http_client().tenant_list_locations()["tenant_shards"]
             for loc in locations:
                 tenant_shard_id = TenantShardId.parse(loc[0])
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                 assert tenant_shard_id.shard_count == split_shard_count
-                count += 1
-        assert count == split_shard_count
+                if loc[1]["mode"] == "Secondary":
+                    secondary_count += 1
+                else:
+                    attached_count += 1
+        assert attached_count == split_shard_count
+        assert secondary_count == split_shard_count
 
     def finish_split():
         # Having failed+rolled back, we should be able to split again
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py
index a6b0f76c96..b7488cadd6 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -23,7 +23,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.types import TenantId, TimelineId
+from fixtures.types import TenantId, TenantShardId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
@@ -948,3 +948,65 @@ def test_sharding_service_heartbeats(
         env.storage_controller.consistency_check()
 
     wait_until(10, 1, storage_controller_consistent)
+
+
+def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
+    """
+    Exercise the behavior of the /re-attach endpoint on pageserver startup when
+    pageservers have a mixture of attached and secondary locations
+    """
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    # We'll have two tenants.
+    tenant_a = TenantId.generate()
+    env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
+    tenant_b = TenantId.generate()
+    env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')
+
+    # Each pageserver will have one attached and one secondary location
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
+    )
+    env.storage_controller.tenant_shard_migrate(
+        TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
+    )
+
+    # Hard-fail a pageserver
+    victim_ps = env.pageservers[1]
+    survivor_ps = env.pageservers[0]
+    victim_ps.stop(immediate=True)
+
+    # Heatbeater will notice it's offline, and consequently attachments move to the other pageserver
+    def failed_over():
+        locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
+        log.info(f"locations: {locations}")
+        assert len(locations) == 2
+        assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)
+
+    # We could pre-empty this by configuring the node to Offline, but it's preferable to test
+    # the realistic path we would take when a node restarts uncleanly.
+    # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
+    wait_until(30, 1, failed_over)
+
+    reconciles_before_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+
+    # Restart the failed pageserver
+    victim_ps.start()
+
+    # We expect that the re-attach call correctly tipped off the pageserver that its locations
+    # are all secondaries now.
+    locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
+    assert len(locations) == 2
+    assert all(loc[1]["mode"] == "Secondary" for loc in locations)
+
+    # We expect that this situation resulted from the re_attach call, and not any explicit
+    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
+    reconciles_after_restart = env.storage_controller.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+    assert reconciles_after_restart == reconciles_before_restart