Merge pull request #7173 from neondatabase/rc/proxy/2024-03-19

Proxy release 2024-03-19
Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14
2026-05-20 22:50:38 +00:00 · 2024-03-19 12:11:42 +00:00 · 2024-03-14 14:57:05 +05:00 · 2024-03-14 14:16:36 +05:00 · 2024-03-08 08:19:16 +00:00 · 2024-03-04 17:36:11 +04:00
88 changed files with 1593 additions and 4082 deletions
--- a/3
+++ b/3
@@ -1,13 +1,12 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
+/libs/postgres_ffi/ @neondatabase/compute
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -277,7 +277,6 @@ dependencies = [
 "anyhow",
 "aws-config",
 "aws-sdk-secretsmanager",
- "bytes",
 "camino",
 "clap",
 "control_plane",
@@ -289,8 +288,6 @@ dependencies = [
 "hex",
 "humantime",
 "hyper",
- "lasso",
- "measured",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -298,7 +295,6 @@ dependencies = [
 "postgres_connection",
 "r2d2",
 "reqwest",
- "routerify",
 "serde",
 "serde_json",
 "thiserror",
@@ -2884,35 +2880,6 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

-[[package]]
-name = "measured"
-version = "0.0.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
-dependencies = [
- "bytes",
- "hashbrown 0.14.0",
- "itoa",
- "lasso",
- "measured-derive",
- "memchr",
- "parking_lot 0.12.1",
- "rustc-hash",
- "ryu",
-]
-
-[[package]]
-name = "measured-derive"
-version = "0.0.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
-dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "syn 2.0.52",
-]
-
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -3934,7 +3901,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3947,7 +3914,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3958,7 +3925,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3971,13 +3938,12 @@ dependencies = [
 "rand 0.8.5",
 "sha2",
 "stringprep",
- "tokio",
 ]

 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4209,7 +4175,6 @@ dependencies = [
 "consumption_metrics",
 "dashmap",
 "env_logger",
- "fallible-iterator",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4260,7 +4225,6 @@ dependencies = [
 "smallvec",
 "smol_str",
 "socket2 0.5.5",
- "subtle",
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
@@ -5382,23 +5346,13 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"

 [[package]]
 name = "sha2"
-version = "0.10.8"
+version = "0.10.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
+checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
 dependencies = [
 "cfg-if",
 "cpufeatures",
 "digest",
- "sha2-asm",
-]
-
-[[package]]
-name = "sha2-asm"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
-dependencies = [
- "cc",
 ]

 [[package]]
@@ -5981,7 +5935,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6514,7 +6468,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "arc-swap",
- "async-compression",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6553,14 +6506,12 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
- "tokio-tar",
 "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
 "url",
 "uuid",
- "walkdir",
 "workspace_hack",
 ]

@@ -7078,7 +7029,6 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
- "sha2",
 "smallvec",
 "subtle",
 "syn 1.0.109",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -76,7 +76,6 @@ either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
-fallible-iterator = "0.2"
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
@@ -102,7 +101,6 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.13", features=["default", "lasso"] }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -150,7 +148,6 @@ smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
-"subtle"  = "2.5.0"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.77.0
+ENV RUSTC_VERSION=1.76.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
-    cargo install cargo-deny --locked && \
+    cargo install cargo-deny && \
    cargo install cargo-hack && \
    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
            -b /usr/local/bin/postgres
 ```

-## State Diagram
-
-Computes can be in various states. Below is a diagram that details how a
-compute moves between states.
-
-```mermaid
-%% https://mermaid.js.org/syntax/stateDiagram.html
-stateDiagram-v2
-  [*] --> Empty : Compute spawned
-  Empty --> ConfigurationPending : Waiting for compute spec
-  ConfigurationPending --> Configuration : Received compute spec
-  Configuration --> Failed : Failed to configure the compute
-  Configuration --> Running : Compute has been configured
-  Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPending : Requested termination
-  Init --> Failed : Failed to start Postgres
-  Init --> Running : Started Postgres
-  Running --> TerminationPending : Requested termination
-  TerminationPending --> Terminated : Terminated compute
-  Failed --> [*] : Compute exited
-  Terminated --> [*] : Compute exited
-```
-
 ## Tests

 Cargo formatter:
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
        .write(true)
        .create(true)
        .append(false)
-        .truncate(false)
        .open(path)?;
    let buf = io::BufReader::new(&file);
    let mut count: usize = 0;
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -17,7 +17,6 @@ testing = []
 anyhow.workspace = true
 aws-config.workspace = true
 aws-sdk-secretsmanager.workspace = true
-bytes.workspace = true
 camino.workspace = true
 clap.workspace = true
 fail.workspace = true
@@ -26,20 +25,17 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
-lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
 reqwest.workspace = true
-routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
-measured.workspace = true

 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
@@ -1,3 +0,0 @@
-
-UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
-UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
@@ -1,3 +0,0 @@
-
-UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
-UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,11 +1,5 @@
-use crate::metrics::{
-    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
-    METRICS_REGISTRY,
-};
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
-use futures::Future;
-use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
@@ -40,8 +34,6 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

 use control_plane::storage_controller::{AttachHookRequest, InspectRequest};

-use routerify::Middleware;
-
 /// State available to HTTP request handlers
 #[derive(Clone)]
 pub struct HttpState {
@@ -321,7 +313,7 @@ async fn handle_tenant_timeline_passthrough(
    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);

    // Find the node that holds shard zero
-    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
+    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;

    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
    // rewrite this to a shard-aware shard zero ID.
@@ -330,39 +322,12 @@ async fn handle_tenant_timeline_passthrough(
    let tenant_shard_str = format!("{}", tenant_shard_id);
    let path = path.replace(&tenant_str, &tenant_shard_str);

-    let latency = &METRICS_REGISTRY
-        .metrics_group
-        .storage_controller_passthrough_request_latency;
-
-    // This is a bit awkward. We remove the param from the request
-    // and join the words by '_' to get a label for the request.
-    let just_path = path.replace(&tenant_shard_str, "");
-    let path_label = just_path
-        .split('/')
-        .filter(|token| !token.is_empty())
-        .collect::<Vec<_>>()
-        .join("_");
-    let labels = PageserverRequestLabelGroup {
-        pageserver_id: &node.get_id().to_string(),
-        path: &path_label,
-        method: crate::metrics::Method::Get,
-    };
-
-    let _timer = latency.start_timer(labels.clone());
-
-    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
+    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
    let resp = client.get_raw(path).await.map_err(|_e|
        // FIXME: give APiError a proper Unavailable variant.  We return 503 here because
        // if we can't successfully send a request to the pageserver, we aren't available.
        ApiError::ShuttingDown)?;

-    if !resp.status().is_success() {
-        let error_counter = &METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_passthrough_request_error;
-        error_counter.inc(labels);
-    }
-
    // We have a reqest::Response, would like a http::Response
    let mut builder = hyper::Response::builder()
        .status(resp.status())
@@ -388,16 +353,6 @@ async fn handle_tenant_locate(
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

-async fn handle_tenant_describe(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
-}
-
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -533,11 +488,7 @@ impl From<ReconcileError> for ApiError {

 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(
-    request: Request<Body>,
-    handler: H,
-    request_name: RequestName,
-) -> R::Output
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
 where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -557,10 +508,9 @@ where
        ));
    }

-    named_request_span(
+    request_span(
        request,
        |request| async move { handler(service, request).await },
-        request_name,
    )
    .await
 }
@@ -571,98 +521,11 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
    })
 }

-#[derive(Clone, Debug)]
-struct RequestMeta {
-    method: hyper::http::Method,
-    at: Instant,
-}
-
-fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-) -> Middleware<B, ApiError> {
-    Middleware::pre(move |req| async move {
-        let meta = RequestMeta {
-            method: req.method().clone(),
-            at: Instant::now(),
-        };
-
-        req.set_context(meta);
-
-        Ok(req)
-    })
-}
-
-fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-) -> Middleware<B, ApiError> {
-    Middleware::post_with_info(move |resp, req_info| async move {
-        let request_name = match req_info.context::<RequestName>() {
-            Some(name) => name,
-            None => {
-                return Ok(resp);
-            }
-        };
-
-        if let Some(meta) = req_info.context::<RequestMeta>() {
-            let status = &crate::metrics::METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_http_request_status;
-            let latency = &crate::metrics::METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_http_request_latency;
-
-            status.inc(HttpRequestStatusLabelGroup {
-                path: request_name.0,
-                method: meta.method.clone().into(),
-                status: crate::metrics::StatusCode(resp.status()),
-            });
-
-            latency.observe(
-                HttpRequestLatencyLabelGroup {
-                    path: request_name.0,
-                    method: meta.method.into(),
-                },
-                meta.at.elapsed().as_secs_f64(),
-            );
-        }
-        Ok(resp)
-    })
-}
-
-pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
-
-    let payload = crate::metrics::METRICS_REGISTRY.encode();
-    let response = Response::builder()
-        .status(200)
-        .header(CONTENT_TYPE, TEXT_FORMAT)
-        .body(payload.into())
-        .unwrap();
-
-    Ok(response)
-}
-
-#[derive(Clone)]
-struct RequestName(&'static str);
-
-async fn named_request_span<R, H>(
-    request: Request<Body>,
-    handler: H,
-    name: RequestName,
-) -> R::Output
-where
-    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
-    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
-{
-    request.set_context(name);
-    request_span(request, handler).await
-}
-
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router()
-        .middleware(prologue_metrics_middleware())
-        .middleware(epilogue_metrics_middleware());
+    let mut router = endpoint::make_router();
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
@@ -671,166 +534,96 @@ pub fn make_router(
            } else {
                state.auth.as_deref()
            }
-        }));
+        }))
    }

    router
        .data(Arc::new(HttpState::new(service, auth)))
-        .get("/metrics", |r| {
-            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
-        })
        // Non-prefixed generic endpoints (status, metrics)
-        .get("/status", |r| {
-            named_request_span(r, handle_status, RequestName("status"))
-        })
-        .get("/ready", |r| {
-            named_request_span(r, handle_ready, RequestName("ready"))
-        })
+        .get("/status", |r| request_span(r, handle_status))
+        .get("/ready", |r| request_span(r, handle_ready))
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
-            named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
-        })
-        .post("/upcall/v1/validate", |r| {
-            named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
+            request_span(r, handle_re_attach)
        })
+        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
        // Test/dev/debug endpoints
        .post("/debug/v1/attach-hook", |r| {
-            named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
-        })
-        .post("/debug/v1/inspect", |r| {
-            named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
+            request_span(r, handle_attach_hook)
        })
+        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
+            request_span(r, handle_tenant_drop)
        })
        .post("/debug/v1/node/:node_id/drop", |r| {
-            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
-        })
-        .get("/debug/v1/tenant", |r| {
-            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
-        })
-        .get("/debug/v1/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_locate,
-                RequestName("debug_v1_tenant_locate"),
-            )
+            request_span(r, handle_node_drop)
        })
+        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
        .get("/debug/v1/scheduler", |r| {
-            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
+            request_span(r, handle_scheduler_dump)
        })
        .post("/debug/v1/consistency_check", |r| {
-            named_request_span(
-                r,
-                handle_consistency_check,
-                RequestName("debug_v1_consistency_check"),
-            )
+            request_span(r, handle_consistency_check)
        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
+        .get("/control/v1/tenant/:tenant_id/locate", |r| {
+            tenant_service_handler(r, handle_tenant_locate)
+        })
        // Node operations
        .post("/control/v1/node", |r| {
-            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
-        })
-        .get("/control/v1/node", |r| {
-            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
+            request_span(r, handle_node_register)
        })
+        .get("/control/v1/node", |r| request_span(r, handle_node_list))
        .put("/control/v1/node/:node_id/config", |r| {
-            named_request_span(
-                r,
-                handle_node_configure,
-                RequestName("control_v1_node_config"),
-            )
+            request_span(r, handle_node_configure)
        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_shard_migrate,
-                RequestName("control_v1_tenant_migrate"),
-            )
+            tenant_service_handler(r, handle_tenant_shard_migrate)
        })
        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_shard_split,
-                RequestName("control_v1_tenant_shard_split"),
-            )
-        })
-        .get("/control/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_describe,
-                RequestName("control_v1_tenant_describe"),
-            )
+            tenant_service_handler(r, handle_tenant_shard_split)
        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
        .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
+            tenant_service_handler(r, handle_tenant_create)
        })
        .delete("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
+            tenant_service_handler(r, handle_tenant_delete)
        })
        .put("/v1/tenant/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
+            tenant_service_handler(r, handle_tenant_config_set)
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
+            tenant_service_handler(r, handle_tenant_config_get)
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_location_config,
-                RequestName("v1_tenant_location_config"),
-            )
+            tenant_service_handler(r, handle_tenant_location_config)
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_time_travel_remote_storage,
-                RequestName("v1_tenant_time_travel_remote_storage"),
-            )
+            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
        })
        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_secondary_download,
-                RequestName("v1_tenant_secondary_download"),
-            )
+            tenant_service_handler(r, handle_tenant_secondary_download)
        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_delete,
-                RequestName("v1_tenant_timeline"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_delete)
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_create,
-                RequestName("v1_tenant_timeline"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_create)
        })
        // Tenant detail GET passthrough to shard zero
        .get("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_passthrough,
-                RequestName("v1_tenant_passthrough"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
        // timeline GET APIs will be implicitly included.
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_passthrough,
-                RequestName("v1_tenant_timeline_passthrough"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -8,7 +8,6 @@ pub mod http;
 mod id_lock_map;
 pub mod metrics;
 mod node;
-mod pageserver_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -1,284 +1,32 @@
-//!
-//! This module provides metric definitions for the storage controller.
-//!
-//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
-//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
-//! constant.
-//!
-//! The rest of the code defines label group types and deals with converting outer types to labels.
-//!
-use bytes::Bytes;
-use measured::{
-    label::{LabelValue, StaticLabelSet},
-    FixedCardinalityLabel, MetricGroup,
-};
+use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use std::sync::Mutex;

-use crate::persistence::{DatabaseError, DatabaseOperation};
+pub(crate) struct ReconcilerMetrics {
+    pub(crate) spawned: IntCounter,
+    pub(crate) complete: IntCounterVec,
+}

-pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
-    Lazy::new(StorageControllerMetrics::default);
+impl ReconcilerMetrics {
+    // Values of the `status` label on [`Self::complete`]; each outcome must map to a distinct string
+    pub(crate) const SUCCESS: &'static str = "ok";
+    pub(crate) const ERROR: &'static str = "error";
+    pub(crate) const CANCEL: &'static str = "cancel";
+}
+
+pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
+    spawned: register_int_counter!(
+        "storage_controller_reconcile_spawn",
+        "Count of how many times we spawn a reconcile task",
+    )
+    .expect("failed to define a metric"),
+    complete: register_int_counter_vec!(
+        "storage_controller_reconcile_complete",
+        "Reconciler tasks completed, broken down by success/failure/cancelled",
+        &["status"],
+    )
+    .expect("failed to define a metric"),
+});

 pub fn preinitialize_metrics() {
-    Lazy::force(&METRICS_REGISTRY);
-}
-
-pub(crate) struct StorageControllerMetrics {
-    pub(crate) metrics_group: StorageControllerMetricGroup,
-    encoder: Mutex<measured::text::TextEncoder>,
-}
-
-#[derive(measured::MetricGroup)]
-pub(crate) struct StorageControllerMetricGroup {
-    /// Count of how many times we spawn a reconcile task
-    pub(crate) storage_controller_reconcile_spawn: measured::Counter,
-    /// Reconciler tasks completed, broken down by success/failure/cancelled
-    pub(crate) storage_controller_reconcile_complete:
-        measured::CounterVec<ReconcileCompleteLabelGroupSet>,
-
-    /// HTTP request status counters for handled requests
-    pub(crate) storage_controller_http_request_status:
-        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
-    /// HTTP request handler latency across all status codes
-    pub(crate) storage_controller_http_request_latency:
-        measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
-
-    /// Count of HTTP requests to the pageserver that resulted in an error,
-    /// broken down by the pageserver node id, request name and method
-    pub(crate) storage_controller_pageserver_request_error:
-        measured::CounterVec<PageserverRequestLabelGroupSet>,
-
-    /// Latency of HTTP requests to the pageserver, broken down by pageserver
-    /// node id, request name and method. This include both successful and unsuccessful
-    /// requests.
-    pub(crate) storage_controller_pageserver_request_latency:
-        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
-
-    /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
-    /// broken down by the pageserver node id, request name and method
-    pub(crate) storage_controller_passthrough_request_error:
-        measured::CounterVec<PageserverRequestLabelGroupSet>,
-
-    /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
-    /// node id, request name and method. This include both successful and unsuccessful
-    /// requests.
-    pub(crate) storage_controller_passthrough_request_latency:
-        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
-
-    /// Count of errors in database queries, broken down by error type and operation.
-    pub(crate) storage_controller_database_query_error:
-        measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
-
-    /// Latency of database queries, broken down by operation.
-    pub(crate) storage_controller_database_query_latency:
-        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
-}
-
-impl StorageControllerMetrics {
-    pub(crate) fn encode(&self) -> Bytes {
-        let mut encoder = self.encoder.lock().unwrap();
-        self.metrics_group.collect_into(&mut *encoder);
-        encoder.finish()
-    }
-}
-
-impl Default for StorageControllerMetrics {
-    fn default() -> Self {
-        Self {
-            metrics_group: StorageControllerMetricGroup::new(),
-            encoder: Mutex::new(measured::text::TextEncoder::new()),
-        }
-    }
-}
-
-impl StorageControllerMetricGroup {
-    pub(crate) fn new() -> Self {
-        Self {
-            storage_controller_reconcile_spawn: measured::Counter::new(),
-            storage_controller_reconcile_complete: measured::CounterVec::new(
-                ReconcileCompleteLabelGroupSet {
-                    status: StaticLabelSet::new(),
-                },
-            ),
-            storage_controller_http_request_status: measured::CounterVec::new(
-                HttpRequestStatusLabelGroupSet {
-                    path: lasso::ThreadedRodeo::new(),
-                    method: StaticLabelSet::new(),
-                    status: StaticLabelSet::new(),
-                },
-            ),
-            storage_controller_http_request_latency: measured::HistogramVec::new(
-                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
-            ),
-            storage_controller_pageserver_request_error: measured::CounterVec::new(
-                PageserverRequestLabelGroupSet {
-                    pageserver_id: lasso::ThreadedRodeo::new(),
-                    path: lasso::ThreadedRodeo::new(),
-                    method: StaticLabelSet::new(),
-                },
-            ),
-            storage_controller_pageserver_request_latency: measured::HistogramVec::new(
-                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
-            ),
-            storage_controller_passthrough_request_error: measured::CounterVec::new(
-                PageserverRequestLabelGroupSet {
-                    pageserver_id: lasso::ThreadedRodeo::new(),
-                    path: lasso::ThreadedRodeo::new(),
-                    method: StaticLabelSet::new(),
-                },
-            ),
-            storage_controller_passthrough_request_latency: measured::HistogramVec::new(
-                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
-            ),
-            storage_controller_database_query_error: measured::CounterVec::new(
-                DatabaseQueryErrorLabelGroupSet {
-                    operation: StaticLabelSet::new(),
-                    error_type: StaticLabelSet::new(),
-                },
-            ),
-            storage_controller_database_query_latency: measured::HistogramVec::new(
-                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
-            ),
-        }
-    }
-}
-
-#[derive(measured::LabelGroup)]
-#[label(set = ReconcileCompleteLabelGroupSet)]
-pub(crate) struct ReconcileCompleteLabelGroup {
-    pub(crate) status: ReconcileOutcome,
-}
-
-#[derive(measured::LabelGroup)]
-#[label(set = HttpRequestStatusLabelGroupSet)]
-pub(crate) struct HttpRequestStatusLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo)]
-    pub(crate) path: &'a str,
-    pub(crate) method: Method,
-    pub(crate) status: StatusCode,
-}
-
-#[derive(measured::LabelGroup)]
-#[label(set = HttpRequestLatencyLabelGroupSet)]
-pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo)]
-    pub(crate) path: &'a str,
-    pub(crate) method: Method,
-}
-
-impl Default for HttpRequestLatencyLabelGroupSet {
-    fn default() -> Self {
-        Self {
-            path: lasso::ThreadedRodeo::new(),
-            method: StaticLabelSet::new(),
-        }
-    }
-}
-
-#[derive(measured::LabelGroup, Clone)]
-#[label(set = PageserverRequestLabelGroupSet)]
-pub(crate) struct PageserverRequestLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo)]
-    pub(crate) pageserver_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo)]
-    pub(crate) path: &'a str,
-    pub(crate) method: Method,
-}
-
-impl Default for PageserverRequestLabelGroupSet {
-    fn default() -> Self {
-        Self {
-            pageserver_id: lasso::ThreadedRodeo::new(),
-            path: lasso::ThreadedRodeo::new(),
-            method: StaticLabelSet::new(),
-        }
-    }
-}
-
-#[derive(measured::LabelGroup)]
-#[label(set = DatabaseQueryErrorLabelGroupSet)]
-pub(crate) struct DatabaseQueryErrorLabelGroup {
-    pub(crate) error_type: DatabaseErrorLabel,
-    pub(crate) operation: DatabaseOperation,
-}
-
-#[derive(measured::LabelGroup)]
-#[label(set = DatabaseQueryLatencyLabelGroupSet)]
-pub(crate) struct DatabaseQueryLatencyLabelGroup {
-    pub(crate) operation: DatabaseOperation,
-}
-
-#[derive(FixedCardinalityLabel)]
-pub(crate) enum ReconcileOutcome {
-    #[label(rename = "ok")]
-    Success,
-    Error,
-    Cancel,
-}
-
-#[derive(FixedCardinalityLabel, Clone)]
-pub(crate) enum Method {
-    Get,
-    Put,
-    Post,
-    Delete,
-    Other,
-}
-
-impl From<hyper::Method> for Method {
-    fn from(value: hyper::Method) -> Self {
-        if value == hyper::Method::GET {
-            Method::Get
-        } else if value == hyper::Method::PUT {
-            Method::Put
-        } else if value == hyper::Method::POST {
-            Method::Post
-        } else if value == hyper::Method::DELETE {
-            Method::Delete
-        } else {
-            Method::Other
-        }
-    }
-}
-
-pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
-
-impl LabelValue for StatusCode {
-    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
-        v.write_int(self.0.as_u16() as u64)
-    }
-}
-
-impl FixedCardinalityLabel for StatusCode {
-    fn cardinality() -> usize {
-        (100..1000).len()
-    }
-
-    fn encode(&self) -> usize {
-        self.0.as_u16() as usize
-    }
-
-    fn decode(value: usize) -> Self {
-        Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
-    }
-}
-
-#[derive(FixedCardinalityLabel)]
-pub(crate) enum DatabaseErrorLabel {
-    Query,
-    Connection,
-    ConnectionPool,
-    Logical,
-}
-
-impl DatabaseError {
-    pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
-        match self {
-            Self::Query(_) => DatabaseErrorLabel::Query,
-            Self::Connection(_) => DatabaseErrorLabel::Connection,
-            Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
-            Self::Logical(_) => DatabaseErrorLabel::Logical,
-        }
-    }
+    Lazy::force(&RECONCILER);
 }
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -12,9 +12,7 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};

-use crate::{
-    pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
-};
+use crate::{persistence::NodePersistence, scheduler::MaySchedule};

 /// Represents the in-memory description of a Node.
 ///
@@ -204,7 +202,7 @@ impl Node {
        cancel: &CancellationToken,
    ) -> Option<mgmt_api::Result<T>>
    where
-        O: FnMut(PageserverClient) -> F,
+        O: FnMut(mgmt_api::Client) -> F,
        F: std::future::Future<Output = mgmt_api::Result<T>>,
    {
        fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -226,12 +224,8 @@ impl Node {
                    .build()
                    .expect("Failed to construct HTTP client");

-                let client = PageserverClient::from_client(
-                    self.get_id(),
-                    http_client,
-                    self.base_url(),
-                    jwt.as_deref(),
-                );
+                let client =
+                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());

                let node_cancel_fut = self.cancel.cancelled();

--- a/control_plane/attachment_service/src/pageserver_client.rs
+++ b/control_plane/attachment_service/src/pageserver_client.rs
@@ -1,203 +0,0 @@
-use pageserver_api::{
-    models::{
-        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
-    },
-    shard::TenantShardId,
-};
-use pageserver_client::mgmt_api::{Client, Result};
-use reqwest::StatusCode;
-use utils::id::{NodeId, TimelineId};
-
-/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
-/// controller to collect metrics in a non-intrusive manner.
-#[derive(Debug, Clone)]
-pub(crate) struct PageserverClient {
-    inner: Client,
-    node_id_label: String,
-}
-
-macro_rules! measured_request {
-    ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
-        let labels = crate::metrics::PageserverRequestLabelGroup {
-            pageserver_id: $node_id,
-            path: $name,
-            method: $method,
-        };
-
-        let latency = &crate::metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_pageserver_request_latency;
-        let _timer_guard = latency.start_timer(labels.clone());
-
-        let res = $invoke;
-
-        if res.is_err() {
-            let error_counters = &crate::metrics::METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_pageserver_request_error;
-            error_counters.inc(labels)
-        }
-
-        res
-    }};
-}
-
-impl PageserverClient {
-    pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
-        Self {
-            inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
-            node_id_label: node_id.0.to_string(),
-        }
-    }
-
-    pub(crate) fn from_client(
-        node_id: NodeId,
-        raw_client: reqwest::Client,
-        mgmt_api_endpoint: String,
-        jwt: Option<&str>,
-    ) -> Self {
-        Self {
-            inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
-            node_id_label: node_id.0.to_string(),
-        }
-    }
-
-    pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
-        measured_request!(
-            "tenant",
-            crate::metrics::Method::Delete,
-            &self.node_id_label,
-            self.inner.tenant_delete(tenant_shard_id).await
-        )
-    }
-
-    pub(crate) async fn tenant_time_travel_remote_storage(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timestamp: &str,
-        done_if_after: &str,
-    ) -> Result<()> {
-        measured_request!(
-            "tenant_time_travel_remote_storage",
-            crate::metrics::Method::Put,
-            &self.node_id_label,
-            self.inner
-                .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
-                .await
-        )
-    }
-
-    pub(crate) async fn tenant_secondary_download(
-        &self,
-        tenant_id: TenantShardId,
-        wait: Option<std::time::Duration>,
-    ) -> Result<(StatusCode, SecondaryProgress)> {
-        measured_request!(
-            "tenant_secondary_download",
-            crate::metrics::Method::Post,
-            &self.node_id_label,
-            self.inner.tenant_secondary_download(tenant_id, wait).await
-        )
-    }
-
-    pub(crate) async fn location_config(
-        &self,
-        tenant_shard_id: TenantShardId,
-        config: LocationConfig,
-        flush_ms: Option<std::time::Duration>,
-        lazy: bool,
-    ) -> Result<()> {
-        measured_request!(
-            "location_config",
-            crate::metrics::Method::Put,
-            &self.node_id_label,
-            self.inner
-                .location_config(tenant_shard_id, config, flush_ms, lazy)
-                .await
-        )
-    }
-
-    pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
-        measured_request!(
-            "location_configs",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.list_location_config().await
-        )
-    }
-
-    pub(crate) async fn get_location_config(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<Option<LocationConfig>> {
-        measured_request!(
-            "location_config",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.get_location_config(tenant_shard_id).await
-        )
-    }
-
-    pub(crate) async fn timeline_create(
-        &self,
-        tenant_shard_id: TenantShardId,
-        req: &TimelineCreateRequest,
-    ) -> Result<TimelineInfo> {
-        measured_request!(
-            "timeline",
-            crate::metrics::Method::Post,
-            &self.node_id_label,
-            self.inner.timeline_create(tenant_shard_id, req).await
-        )
-    }
-
-    pub(crate) async fn timeline_delete(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Result<StatusCode> {
-        measured_request!(
-            "timeline",
-            crate::metrics::Method::Delete,
-            &self.node_id_label,
-            self.inner
-                .timeline_delete(tenant_shard_id, timeline_id)
-                .await
-        )
-    }
-
-    pub(crate) async fn tenant_shard_split(
-        &self,
-        tenant_shard_id: TenantShardId,
-        req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse> {
-        measured_request!(
-            "tenant_shard_split",
-            crate::metrics::Method::Put,
-            &self.node_id_label,
-            self.inner.tenant_shard_split(tenant_shard_id, req).await
-        )
-    }
-
-    pub(crate) async fn timeline_list(
-        &self,
-        tenant_shard_id: &TenantShardId,
-    ) -> Result<Vec<TimelineInfo>> {
-        measured_request!(
-            "timelines",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.timeline_list(tenant_shard_id).await
-        )
-    }
-
-    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
-        measured_request!(
-            "utilization",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.get_utilization().await
-        )
-    }
-}
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -19,9 +19,6 @@ use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};

-use crate::metrics::{
-    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
-};
 use crate::node::Node;

 /// ## What do we store?
@@ -78,25 +75,6 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }

-#[derive(measured::FixedCardinalityLabel, Clone)]
-pub(crate) enum DatabaseOperation {
-    InsertNode,
-    UpdateNode,
-    DeleteNode,
-    ListNodes,
-    BeginShardSplit,
-    CompleteShardSplit,
-    AbortShardSplit,
-    Detach,
-    ReAttach,
-    IncrementGeneration,
-    ListTenantShards,
-    InsertTenantShards,
-    UpdateTenantShard,
-    DeleteTenant,
-    UpdateTenantConfig,
-}
-
 #[must_use]
 pub(crate) enum AbortShardSplitStatus {
    /// We aborted the split in the database by reverting to the parent shards
@@ -137,34 +115,6 @@ impl Persistence {
        }
    }

-    /// Wraps `with_conn` in order to collect latency and error metrics
-    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
-    where
-        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
-        R: Send + 'static,
-    {
-        let latency = &METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_database_query_latency;
-        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
-            operation: op.clone(),
-        });
-
-        let res = self.with_conn(func).await;
-
-        if let Err(err) = &res {
-            let error_counter = &METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_database_query_error;
-            error_counter.inc(DatabaseQueryErrorLabelGroup {
-                error_type: err.error_label(),
-                operation: op,
-            })
-        }
-
-        res
-    }
-
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
@@ -180,27 +130,21 @@ impl Persistence {
    /// When a node is first registered, persist it before using it for anything
    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
        let np = node.to_persistent();
-        self.with_measured_conn(
-            DatabaseOperation::InsertNode,
-            move |conn| -> DatabaseResult<()> {
-                diesel::insert_into(crate::schema::nodes::table)
-                    .values(&np)
-                    .execute(conn)?;
-                Ok(())
-            },
-        )
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::insert_into(crate::schema::nodes::table)
+                .values(&np)
+                .execute(conn)?;
+            Ok(())
+        })
        .await
    }

    /// At startup, populate the list of nodes which our shards may be placed on
    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
        let nodes: Vec<NodePersistence> = self
-            .with_measured_conn(
-                DatabaseOperation::ListNodes,
-                move |conn| -> DatabaseResult<_> {
-                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
-                },
-            )
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
+            })
            .await?;

        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -215,7 +159,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
+            .with_conn(move |conn| {
                let updated = diesel::update(nodes)
                    .filter(node_id.eq(input_node_id.0 as i64))
                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -237,12 +181,9 @@ impl Persistence {
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let loaded = self
-            .with_measured_conn(
-                DatabaseOperation::ListTenantShards,
-                move |conn| -> DatabaseResult<_> {
-                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
-                },
-            )
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            })
            .await?;

        if loaded.is_empty() {
@@ -270,10 +211,15 @@ impl Persistence {

        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for shard in decoded.tenants.values_mut() {
-            if shard.placement_policy == "\"Single\"" {
-                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
-                shard.placement_policy = "{\"Attached\":0}".to_string();
+        for (tenant_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                tenant.tenant_id = tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
            }
        }

@@ -319,20 +265,17 @@ impl Persistence {
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::InsertTenantShards,
-            move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    for tenant in &shards {
-                        diesel::insert_into(tenant_shards)
-                            .values(tenant)
-                            .execute(conn)?;
-                    }
-                    Ok(())
-                })?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                Ok(())
-            },
-        )
+            })?;
+            Ok(())
+        })
        .await
    }

@@ -340,31 +283,25 @@ impl Persistence {
    /// the tenant from memory on this server.
    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::DeleteTenant,
-            move |conn| -> DatabaseResult<()> {
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(del_tenant_id.to_string()))
-                    .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(tenant_shards)
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
+                .execute(conn)?;

-                Ok(())
-            },
-        )
+            Ok(())
+        })
        .await
    }

    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::DeleteNode,
-            move |conn| -> DatabaseResult<()> {
-                diesel::delete(nodes)
-                    .filter(node_id.eq(del_node_id.0 as i64))
-                    .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(nodes)
+                .filter(node_id.eq(del_node_id.0 as i64))
+                .execute(conn)?;

-                Ok(())
-            },
-        )
+            Ok(())
+        })
        .await
    }

@@ -378,7 +315,7 @@ impl Persistence {
    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
+            .with_conn(move |conn| {
                let rows_updated = diesel::update(tenant_shards)
                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .set(generation.eq(generation + 1))
@@ -428,7 +365,7 @@ impl Persistence {
    ) -> anyhow::Result<Generation> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
+            .with_conn(move |conn| {
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -472,7 +409,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

-        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
+        self.with_conn(move |conn| {
            let query = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -513,7 +450,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

-        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
+        self.with_conn(move |conn| {
            diesel::update(tenant_shards)
                .filter(tenant_id.eq(input_tenant_id.to_string()))
                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
@@ -528,7 +465,7 @@ impl Persistence {

    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
+        self.with_conn(move |conn| {
            let updated = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -558,7 +495,7 @@ impl Persistence {
        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
+        self.with_conn(move |conn| -> DatabaseResult<()> {
            conn.transaction(|conn| -> DatabaseResult<()> {
                // Mark parent shards as splitting

@@ -622,29 +559,26 @@ impl Persistence {
        old_shard_count: ShardCount,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::CompleteShardSplit,
-            move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    // Drop parent shards
-                    diesel::delete(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .filter(shard_count.eq(old_shard_count.literal() as i32))
-                        .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                // Drop parent shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
+                    .execute(conn)?;

-                    // Clear sharding flag
-                    let updated = diesel::update(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .set((splitting.eq(0),))
-                        .execute(conn)?;
-                    debug_assert!(updated > 0);
-
-                    Ok(())
-                })?;
+                // Clear sharding flag
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+                debug_assert!(updated > 0);

                Ok(())
-            },
-        )
+            })?;
+
+            Ok(())
+        })
        .await
    }

@@ -656,44 +590,40 @@ impl Persistence {
        new_shard_count: ShardCount,
    ) -> DatabaseResult<AbortShardSplitStatus> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::AbortShardSplit,
-            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-                let aborted =
-                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                        // Clear the splitting state on parent shards
-                        let updated = diesel::update(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.ne(new_shard_count.literal() as i32))
-                            .set((splitting.eq(0),))
-                            .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+            let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
+                // Clear the splitting state on parent shards
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;

-                        // Parent shards are already gone: we cannot abort.
-                        if updated == 0 {
-                            return Ok(AbortShardSplitStatus::Complete);
-                        }
+                // Parent shards are already gone: we cannot abort.
+                if updated == 0 {
+                    return Ok(AbortShardSplitStatus::Complete);
+                }

-                        // Sanity check: if parent shards were present, their cardinality should
-                        // be less than the number of child shards.
-                        if updated >= new_shard_count.count() as usize {
-                            return Err(DatabaseError::Logical(format!(
-                                "Unexpected parent shard count {updated} while aborting split to \
+                // Sanity check: if parent shards were present, their cardinality should
+                // be less than the number of child shards.
+                if updated >= new_shard_count.count() as usize {
+                    return Err(DatabaseError::Logical(format!(
+                        "Unexpected parent shard count {updated} while aborting split to \
                            count {new_shard_count:?} on tenant {split_tenant_id}"
-                            )));
-                        }
+                    )));
+                }

-                        // Erase child shards
-                        diesel::delete(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.eq(new_shard_count.literal() as i32))
-                            .execute(conn)?;
+                // Erase child shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                    .execute(conn)?;

-                        Ok(AbortShardSplitStatus::Aborted)
-                    })?;
+                Ok(AbortShardSplitStatus::Aborted)
+            })?;

-                Ok(aborted)
-            },
-        )
+            Ok(aborted)
+        })
        .await
    }
 }
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,4 +1,3 @@
-use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
 use hyper::StatusCode;
@@ -118,15 +117,6 @@ impl Reconciler {
        flush_ms: Option<Duration>,
        lazy: bool,
    ) -> Result<(), ReconcileError> {
-        if !node.is_available() && config.mode == LocationConfigMode::Detached {
-            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
-            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
-            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
-            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
-            self.observed.locations.remove(&node.get_id());
-            return Ok(());
-        }
-
        self.observed
            .locations
            .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -159,16 +149,9 @@ impl Reconciler {
        };
        tracing::info!("location_config({node}) complete: {:?}", config);

-        match config.mode {
-            LocationConfigMode::Detached => {
-                self.observed.locations.remove(&node.get_id());
-            }
-            _ => {
-                self.observed
-                    .locations
-                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
-            }
-        }
+        self.observed
+            .locations
+            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });

        Ok(())
    }
@@ -260,11 +243,8 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let client = PageserverClient::new(
-            node.get_id(),
-            node.base_url(),
-            self.service_config.jwt_token.as_deref(),
-        );
+        let client =
+            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());

        let timelines = client.timeline_list(&tenant_shard_id).await?;
        Ok(timelines
@@ -495,7 +475,7 @@ impl Reconciler {
            }
        }

-        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
        // this location will be deleted in the general case reconciliation that runs after this.
        let origin_secondary_conf = build_location_config(
            &self.shard,
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -7,9 +7,7 @@ use std::{
    time::{Duration, Instant},
 };

-use crate::{
-    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
-};
+use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
 use anyhow::Context;
 use control_plane::storage_controller::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -20,14 +18,12 @@ use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
-        TenantShardMigrateResponse, UtilizationScore,
+        TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest},
 };

-use crate::pageserver_client::PageserverClient;
 use pageserver_api::{
    models::{
        self, LocationConfig, LocationConfigListResponse, LocationConfigMode,
@@ -204,30 +200,6 @@ enum TenantCreateOrUpdate {
    Update(Vec<ShardUpdate>),
 }

-struct ShardSplitParams {
-    old_shard_count: ShardCount,
-    new_shard_count: ShardCount,
-    new_stripe_size: Option<ShardStripeSize>,
-    targets: Vec<ShardSplitTarget>,
-    policy: PlacementPolicy,
-    config: TenantConfig,
-    shard_ident: ShardIdentity,
-}
-
-// When preparing for a shard split, we may either choose to proceed with the split,
-// or find that the work is already done and return NoOp.
-enum ShardSplitAction {
-    Split(ShardSplitParams),
-    NoOp(TenantShardSplitResponse),
-}
-
-// A parent shard which will be split
-struct ShardSplitTarget {
-    parent_id: TenantShardId,
-    node: Node,
-    child_ids: Vec<TenantShardId>,
-}
-
 /// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes
 /// might not be available.  We therefore use a queue of abort operations processed in the background.
 struct TenantShardSplitAbort {
@@ -553,11 +525,7 @@ impl Service {
                break;
            }

-            let client = PageserverClient::new(
-                node.get_id(),
-                node.base_url(),
-                self.config.jwt_token.as_deref(),
-            );
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
            match client
                .location_config(
                    tenant_shard_id,
@@ -765,19 +733,7 @@ impl Service {
                tenant.waiter.advance(result.sequence);
            }
            Err(e) => {
-                match e {
-                    ReconcileError::Cancel => {
-                        tracing::info!("Reconciler was cancelled");
-                    }
-                    ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
-                        // This might be due to the reconciler getting cancelled, or it might
-                        // be due to the `Node` being marked offline.
-                        tracing::info!("Reconciler cancelled during pageserver API call");
-                    }
-                    _ => {
-                        tracing::warn!("Reconcile error: {}", e);
-                    }
-                }
+                tracing::warn!("Reconcile error: {}", e);

                // Ordering: populate last_error before advancing error_seq,
                // so that waiters will see the correct error after waiting.
@@ -1101,7 +1057,7 @@ impl Service {
                shard_stripe_size: 0,
                generation: Some(0),
                generation_pageserver: None,
-                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
+                placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                splitting: SplitState::default(),
            };
@@ -1128,7 +1084,7 @@ impl Service {
                        TenantState::new(
                            attach_req.tenant_shard_id,
                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Attached(0),
+                            PlacementPolicy::Single,
                        ),
                    );
                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
@@ -1157,7 +1113,7 @@ impl Service {
                    self.persistence
                        .update_tenant_shard(
                            attach_req.tenant_shard_id,
-                            PlacementPolicy::Attached(0),
+                            PlacementPolicy::Single,
                            conf,
                            None,
                        )
@@ -1182,7 +1138,7 @@ impl Service {

        if let Some(new_generation) = new_generation {
            tenant_state.generation = Some(new_generation);
-            tenant_state.policy = PlacementPolicy::Attached(0);
+            tenant_state.policy = PlacementPolicy::Single;
        } else {
            // This is a detach notification.  We must update placement policy to avoid re-attaching
            // during background scheduling/reconciliation, or during storage controller restart.
@@ -1394,8 +1350,7 @@ impl Service {
            incremented_generations.len()
        );

-        // Apply the updated generation to our in-memory state, and
-        // gather discover secondary locations.
+        // Apply the updated generation to our in-memory state
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, scheduler) = locked.parts_mut();

@@ -1403,65 +1358,62 @@ impl Service {
            tenants: Vec::new(),
        };

-        // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
-        // to call location_conf API with an old generation.  Wait for cancellation to complete
-        // before responding to this request.  Requires well implemented CancellationToken logic
-        // all the way to where we call location_conf.  Even then, there can still be a location_conf
-        // request in flight over the network: TODO handle that by making location_conf API refuse
-        // to go backward in generations.
+        for (tenant_shard_id, new_gen) in incremented_generations {
+            response.tenants.push(ReAttachResponseTenant {
+                id: tenant_shard_id,
+                gen: new_gen.into().unwrap(),
+            });
+            // Apply the new generation number to our in-memory state
+            let shard_state = tenants.get_mut(&tenant_shard_id);
+            let Some(shard_state) = shard_state else {
+                // Not fatal.  This edge case requires a re-attach to happen
+                // between inserting a new tenant shard into the database, and updating our in-memory
+                // state to know about the shard, _and_ that the state inserted to the database referenced
+                // a pageserver.  Should never happen, but handle it rather than panicking, since it should
+                // be harmless.
+                tracing::error!(
+                    "Shard {} is in database for node {} but not in-memory state",
+                    tenant_shard_id,
+                    reattach_req.node_id
+                );
+                continue;
+            };

-        // Scan through all shards, applying updates for ones where we updated generation
-        // and identifying shards that intend to have a secondary location on this node.
-        for (tenant_shard_id, shard) in tenants {
-            if let Some(new_gen) = incremented_generations.get(tenant_shard_id) {
-                let new_gen = *new_gen;
-                response.tenants.push(ReAttachResponseTenant {
-                    id: *tenant_shard_id,
-                    gen: Some(new_gen.into().unwrap()),
-                    // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`]
-                    // execution.  If a pageserver is restarted during that process, then the reconcile pass will
-                    // fail, and start from scratch, so it doesn't make sense for us to try and preserve
-                    // the stale/multi states at this point.
-                    mode: LocationConfigMode::AttachedSingle,
-                });
-
-                shard.generation = std::cmp::max(shard.generation, Some(new_gen));
-                if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) {
-                    // Why can we update `observed` even though we're not sure our response will be received
-                    // by the pageserver?  Because the pageserver will not proceed with startup until
-                    // it has processed response: if it loses it, we'll see another request and increment
-                    // generation again, avoiding any uncertainty about dirtiness of tenant's state.
-                    if let Some(conf) = observed.conf.as_mut() {
-                        conf.generation = new_gen.into();
-                    }
-                } else {
-                    // This node has no observed state for the shard: perhaps it was offline
-                    // when the pageserver restarted.  Insert a None, so that the Reconciler
-                    // will be prompted to learn the location's state before it makes changes.
-                    shard
-                        .observed
-                        .locations
-                        .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
+            // If [`Persistence::re_attach`] selected this shard, it must have already
+            // had a generation set.
+            debug_assert!(shard_state.generation.is_some());
+            let Some(old_gen) = shard_state.generation else {
+                // Should never happen: re-attach would only return an incremented generation
+                // for a tenant that already had a non-null generation.
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Generation must be set while re-attaching"
+                )));
+            };
+            shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
+            if let Some(observed) = shard_state
+                .observed
+                .locations
+                .get_mut(&reattach_req.node_id)
+            {
+                if let Some(conf) = observed.conf.as_mut() {
+                    conf.generation = new_gen.into();
                }
-            } else if shard.intent.get_secondary().contains(&reattach_req.node_id) {
-                // Ordering: pageserver will not accept /location_config requests until it has
-                // finished processing the response from re-attach.  So we can update our in-memory state
-                // now, and be confident that we are not stamping on the result of some later location config.
-                // TODO: however, we are not strictly ordered wrt ReconcileResults queue,
-                // so we might update observed state here, and then get over-written by some racing
-                // ReconcileResult.  The impact is low however, since we have set state on pageserver something
-                // that matches intent, so worst case if we race then we end up doing a spurious reconcile.
-
-                response.tenants.push(ReAttachResponseTenant {
-                    id: *tenant_shard_id,
-                    gen: None,
-                    mode: LocationConfigMode::Secondary,
-                });
-
-                // We must not update observed, because we have no guarantee that our
-                // response will be received by the pageserver. This could leave it
-                // falsely dirty, but the resulting reconcile should be idempotent.
+            } else {
+                // This node has no observed state for the shard: perhaps it was offline
+                // when the pageserver restarted.  Insert a None, so that the Reconciler
+                // will be prompted to learn the location's state before it makes changes.
+                shard_state
+                    .observed
+                    .locations
+                    .insert(reattach_req.node_id, ObservedStateLocation { conf: None });
            }
+
+            // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
+            // to call location_conf API with an old generation.  Wait for cancellation to complete
+            // before responding to this request.  Requires well implemented CancellationToken logic
+            // all the way to where we call location_conf.  Even then, there can still be a location_conf
+            // request in flight over the network: TODO handle that by making location_conf API refuse
+            // to go backward in generations.
        }

        // We consider a node Active once we have composed a re-attach response, but we
@@ -1539,11 +1491,11 @@ impl Service {
        &self,
        create_req: TenantCreateRequest,
    ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
+        // As a default, single is convenient for tests that don't choose a policy.
        let placement_policy = create_req
            .placement_policy
            .clone()
-            // As a default, zero secondaries is convenient for tests that don't choose a policy.
-            .unwrap_or(PlacementPolicy::Attached(0));
+            .unwrap_or(PlacementPolicy::Single);

        // This service expects to handle sharding itself: it is an error to try and directly create
        // a particular shard here.
@@ -1753,11 +1705,11 @@ impl Service {
            | LocationConfigMode::AttachedSingle
            | LocationConfigMode::AttachedStale => {
                if nodes.len() > 1 {
-                    PlacementPolicy::Attached(1)
+                    PlacementPolicy::Double(1)
                } else {
                    // Convenience for dev/test: if we just have one pageserver, import
-                    // tenants into non-HA mode so that scheduling will succeed.
-                    PlacementPolicy::Attached(0)
+                    // tenants into Single mode so that scheduling will succeed.
+                    PlacementPolicy::Single
                }
            }
        };
@@ -2106,11 +2058,8 @@ impl Service {
                })
                .collect::<Vec<_>>();
            for tenant_shard_id in shard_ids {
-                let client = PageserverClient::new(
-                    node.get_id(),
-                    node.base_url(),
-                    self.config.jwt_token.as_deref(),
-                );
+                let client =
+                    mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());

                tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);

@@ -2162,11 +2111,7 @@ impl Service {
        // Issue concurrent requests to all shards' locations
        let mut futs = FuturesUnordered::new();
        for (tenant_shard_id, node) in targets {
-            let client = PageserverClient::new(
-                node.get_id(),
-                node.base_url(),
-                self.config.jwt_token.as_deref(),
-            );
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
            futs.push(async move {
                let result = client
                    .tenant_secondary_download(tenant_shard_id, wait)
@@ -2259,11 +2204,7 @@ impl Service {
        // Phase 1: delete on the pageservers
        let mut any_pending = false;
        for (tenant_shard_id, node) in targets {
-            let client = PageserverClient::new(
-                node.get_id(),
-                node.base_url(),
-                self.config.jwt_token.as_deref(),
-            );
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
            // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
            // surface immediately as an error to our caller.
            let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
@@ -2375,7 +2316,7 @@ impl Service {
                tenant_shard_id,
                create_req.new_timeline_id,
            );
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());

            client
                .timeline_create(tenant_shard_id, &create_req)
@@ -2499,7 +2440,7 @@ impl Service {
                "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
            );

-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
            client
                .timeline_delete(tenant_shard_id, timeline_id)
                .await
@@ -2540,11 +2481,11 @@ impl Service {
    }

    /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
-    /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
-    pub(crate) fn tenant_shard0_node(
+    /// function looks it up and returns the url.  If the tenant isn't found, returns Err(ApiError::NotFound)
+    pub(crate) fn tenant_shard0_baseurl(
        &self,
        tenant_id: TenantId,
-    ) -> Result<(Node, TenantShardId), ApiError> {
+    ) -> Result<(String, TenantShardId), ApiError> {
        let locked = self.inner.read().unwrap();
        let Some((tenant_shard_id, shard)) = locked
            .tenants
@@ -2576,7 +2517,7 @@ impl Service {
            )));
        };

-        Ok((node.clone(), *tenant_shard_id))
+        Ok((node.base_url(), *tenant_shard_id))
    }

    pub(crate) fn tenant_locate(
@@ -2586,6 +2527,9 @@ impl Service {
        let locked = self.inner.read().unwrap();
        tracing::info!("Locating shards for tenant {tenant_id}");

+        // Take a snapshot of pageservers
+        let pageservers = locked.nodes.clone();
+
        let mut result = Vec::new();
        let mut shard_params: Option<ShardParameters> = None;

@@ -2599,8 +2543,7 @@ impl Service {
                        "Cannot locate a tenant that is not attached"
                    )))?;

-            let node = locked
-                .nodes
+            let node = pageservers
                .get(&node_id)
                .expect("Pageservers may not be deleted while referenced");

@@ -2648,47 +2591,6 @@ impl Service {
        })
    }

-    pub(crate) fn tenant_describe(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantDescribeResponse, ApiError> {
-        let locked = self.inner.read().unwrap();
-
-        let mut shard_zero = None;
-        let mut shards = Vec::new();
-
-        for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-        {
-            if tenant_shard_id.is_zero() {
-                shard_zero = Some(shard);
-            }
-
-            let response_shard = TenantDescribeResponseShard {
-                tenant_shard_id: *tenant_shard_id,
-                node_attached: *shard.intent.get_attached(),
-                node_secondary: shard.intent.get_secondary().to_vec(),
-                last_error: shard.last_error.lock().unwrap().clone(),
-                is_reconciling: shard.reconciler.is_some(),
-                is_pending_compute_notification: shard.pending_compute_notification,
-                is_splitting: matches!(shard.splitting, SplitState::Splitting),
-            };
-            shards.push(response_shard);
-        }
-
-        let Some(shard_zero) = shard_zero else {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
-            ));
-        };
-
-        Ok(TenantDescribeResponse {
-            shards,
-            stripe_size: shard_zero.shard.stripe_size,
-            policy: shard_zero.policy.clone(),
-            config: shard_zero.config.clone(),
-        })
-    }
-
    #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
    async fn abort_tenant_shard_split(
        &self,
@@ -2746,7 +2648,7 @@ impl Service {
        let detach_locations: Vec<(Node, TenantShardId)> = {
            let mut detach_locations = Vec::new();
            let mut locked = self.inner.write().unwrap();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
+            let (nodes, tenants, _scheduler) = locked.parts_mut();

            for (tenant_shard_id, shard) in
                tenants.range_mut(TenantShardId::tenant_range(op.tenant_id))
@@ -2779,13 +2681,6 @@ impl Service {

                tracing::info!("Restoring parent shard {tenant_shard_id}");
                shard.splitting = SplitState::Idle;
-                if let Err(e) = shard.schedule(scheduler) {
-                    // If this shard can't be scheduled now (perhaps due to offline nodes or
-                    // capacity issues), that must not prevent us rolling back a split.  In this
-                    // case it should be eventually scheduled in the background.
-                    tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}")
-                }
-
                self.maybe_reconcile_shard(shard, nodes);
            }

@@ -2877,7 +2772,7 @@ impl Service {
                .map(|(shard_id, _)| *shard_id)
                .collect::<Vec<_>>();

-            let (nodes, tenants, scheduler) = locked.parts_mut();
+            let (_nodes, tenants, scheduler) = locked.parts_mut();
            for parent_id in parent_ids {
                let child_ids = parent_id.split(new_shard_count);

@@ -2919,7 +2814,7 @@ impl Service {
                                generation,
                                &child_shard,
                                &config,
-                                matches!(policy, PlacementPolicy::Attached(n) if n > 0),
+                                matches!(policy, PlacementPolicy::Double(n) if n > 0),
                            )),
                        },
                    );
@@ -2944,8 +2839,6 @@ impl Service {
                        // find a secondary (e.g. because cluster is overloaded).
                        tracing::warn!("Failed to schedule child shard {child}: {e}");
                    }
-                    // In the background, attach secondary locations for the new shards
-                    self.maybe_reconcile_shard(&mut child_state, nodes);

                    tenants.insert(child, child_state);
                    response.new_shards.push(child);
@@ -2968,23 +2861,17 @@ impl Service {
        let new_shard_count = ShardCount::new(split_req.new_shard_count);
        let new_stripe_size = split_req.new_stripe_size;

-        // Validate the request and construct parameters.  This phase is fallible, but does not require
-        // rollback on errors, as it does no I/O and mutates no state.
-        let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? {
-            ShardSplitAction::NoOp(resp) => return Ok(resp),
-            ShardSplitAction::Split(params) => params,
-        };
-
-        // Execute this split: this phase mutates state and does remote I/O on pageservers.  If it fails,
-        // we must roll back.
-        let r = self
-            .do_tenant_shard_split(tenant_id, shard_split_params)
-            .await;
+        let r = self.do_tenant_shard_split(tenant_id, split_req).await;

        match r {
            Ok(r) => Ok(r),
+            Err(ApiError::BadRequest(_)) => {
+                // A request validation error does not require rollback, because we rejected it before we started making any
+                // changes: just return the error.
+                r
+            }
            Err(e) => {
-                // Split might be part-done, we must do work to abort it.
+                // General case error handling: split might be part-done, we must do work to abort it.
                tracing::warn!("Enqueuing background abort of split on {tenant_id}");
                self.abort_tx
                    .send(TenantShardSplitAbort {
@@ -3000,18 +2887,25 @@ impl Service {
        }
    }

-    fn prepare_tenant_shard_split(
+    pub(crate) async fn do_tenant_shard_split(
        &self,
        tenant_id: TenantId,
        split_req: TenantShardSplitRequest,
-    ) -> Result<ShardSplitAction, ApiError> {
+    ) -> Result<TenantShardSplitResponse, ApiError> {
+        let mut policy = None;
+        let mut shard_ident = None;
+
+        // A parent shard which will be split
+        struct SplitTarget {
+            parent_id: TenantShardId,
+            node: Node,
+            child_ids: Vec<TenantShardId>,
+        }
+
        fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest(
            anyhow::anyhow!("failpoint")
        )));

-        let mut policy = None;
-        let mut config = None;
-        let mut shard_ident = None;
        // Validate input, and calculate which shards we will create
        let (old_shard_count, targets) =
            {
@@ -3067,9 +2961,6 @@ impl Service {
                    if shard_ident.is_none() {
                        shard_ident = Some(shard.shard);
                    }
-                    if config.is_none() {
-                        config = Some(shard.config.clone());
-                    }

                    if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                        tracing::info!(
@@ -3088,7 +2979,9 @@ impl Service {
                        .get(&node_id)
                        .expect("Pageservers may not be deleted while referenced");

-                    targets.push(ShardSplitTarget {
+                    // TODO: if any reconciliation is currently in progress for this shard, wait for it.
+
+                    targets.push(SplitTarget {
                        parent_id: *tenant_shard_id,
                        node: node.clone(),
                        child_ids: tenant_shard_id
@@ -3098,9 +2991,9 @@ impl Service {

                if targets.is_empty() {
                    if children_found.len() == split_req.new_shard_count as usize {
-                        return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse {
+                        return Ok(TenantShardSplitResponse {
                            new_shards: children_found,
-                        }));
+                        });
                    } else {
                        // No shards found to split, and no existing children found: the
                        // tenant doesn't exist at all.
@@ -3130,77 +3023,13 @@ impl Service {
            shard_ident.unwrap()
        };
        let policy = policy.unwrap();
-        let config = config.unwrap();

-        Ok(ShardSplitAction::Split(ShardSplitParams {
-            old_shard_count,
-            new_shard_count: ShardCount::new(split_req.new_shard_count),
-            new_stripe_size: split_req.new_stripe_size,
-            targets,
-            policy,
-            config,
-            shard_ident,
-        }))
-    }
-
-    async fn do_tenant_shard_split(
-        &self,
-        tenant_id: TenantId,
-        params: ShardSplitParams,
-    ) -> Result<TenantShardSplitResponse, ApiError> {
        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
        // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
        // parent shards exist as expected, but it would be neater to do the above pre-checks within the
        // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
        // (https://github.com/neondatabase/neon/issues/6676)

-        let ShardSplitParams {
-            old_shard_count,
-            new_shard_count,
-            new_stripe_size,
-            mut targets,
-            policy,
-            config,
-            shard_ident,
-        } = params;
-
-        // Drop any secondary locations: pageservers do not support splitting these, and in any case the
-        // end-state for a split tenant will usually be to have secondary locations on different nodes.
-        // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation
-        // at the time of split.
-        let waiters = {
-            let mut locked = self.inner.write().unwrap();
-            let mut waiters = Vec::new();
-            let (nodes, tenants, scheduler) = locked.parts_mut();
-            for target in &mut targets {
-                let Some(shard) = tenants.get_mut(&target.parent_id) else {
-                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                        "Shard {} not found",
-                        target.parent_id
-                    )));
-                };
-
-                if shard.intent.get_attached() != &Some(target.node.get_id()) {
-                    // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID.
-                    return Err(ApiError::Conflict(format!(
-                        "Shard {} unexpectedly rescheduled during split",
-                        target.parent_id
-                    )));
-                }
-
-                // Irrespective of PlacementPolicy, clear secondary locations from intent
-                shard.intent.clear_secondary(scheduler);
-
-                // Run Reconciler to execute detach fo secondary locations.
-                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
-                    waiters.push(waiter);
-                }
-            }
-            waiters
-        };
-        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
-
        // Before creating any new child shards in memory or on the pageservers, persist them: this
        // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
        // acts as the protection against two concurrent attempts to split: one of them will get a database
@@ -3229,7 +3058,8 @@ impl Service {
                    generation: None,
                    generation_pageserver: Some(target.node.get_id().0 as i64),
                    placement_policy: serde_json::to_string(&policy).unwrap(),
-                    config: serde_json::to_string(&config).unwrap(),
+                    // TODO: get the config out of the map
+                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                    splitting: SplitState::Splitting,
                });
            }
@@ -3281,22 +3111,18 @@ impl Service {
        // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).

        for target in &targets {
-            let ShardSplitTarget {
+            let SplitTarget {
                parent_id,
                node,
                child_ids,
            } = target;
-            let client = PageserverClient::new(
-                node.get_id(),
-                node.base_url(),
-                self.config.jwt_token.as_deref(),
-            );
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
            let response = client
                .tenant_shard_split(
                    *parent_id,
                    TenantShardSplitRequest {
-                        new_shard_count: new_shard_count.literal(),
-                        new_stripe_size,
+                        new_shard_count: split_req.new_shard_count,
+                        new_stripe_size: split_req.new_stripe_size,
                    },
                )
                .await
@@ -3345,8 +3171,11 @@ impl Service {
        ));

        // Replace all the shards we just split with their children: this phase is infallible.
-        let (response, child_locations) =
-            self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
+        let (response, child_locations) = self.tenant_shard_split_commit_inmem(
+            tenant_id,
+            ShardCount::new(split_req.new_shard_count),
+            split_req.new_stripe_size,
+        );

        // Send compute notifications for all the new shards
        let mut failed_notifications = Vec::new();
@@ -3411,20 +3240,17 @@ impl Service {
                let old_attached = *shard.intent.get_attached();

                match shard.policy {
-                    PlacementPolicy::Attached(n) => {
+                    PlacementPolicy::Single => {
+                        shard.intent.clear_secondary(scheduler);
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
+                    }
+                    PlacementPolicy::Double(_n) => {
                        // If our new attached node was a secondary, it no longer should be.
                        shard.intent.remove_secondary(scheduler, migrate_req.node_id);

                        // If we were already attached to something, demote that to a secondary
                        if let Some(old_attached) = old_attached {
-                            if n > 0 {
-                                // Remove other secondaries to make room for the location we'll demote
-                                while shard.intent.get_secondary().len() >= n {
-                                    shard.intent.pop_secondary(scheduler);
-                                }
-
-                                shard.intent.push_secondary(scheduler, old_attached);
-                            }
+                            shard.intent.push_secondary(scheduler, old_attached);
                        }

                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
@@ -3450,7 +3276,7 @@ impl Service {
        if let Some(waiter) = waiter {
            waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
        } else {
-            tracing::info!("Migration is a no-op");
+            tracing::warn!("Migration is a no-op");
        }

        Ok(TenantShardMigrateResponse {})
@@ -3805,13 +3631,6 @@ impl Service {
                        observed_loc.conf = None;
                    }

-                    if new_nodes.len() == 1 {
-                        // Special case for single-node cluster: there is no point trying to reschedule
-                        // any tenant shards: avoid doing so, in order to avoid spewing warnings about
-                        // failures to schedule them.
-                        continue;
-                    }
-
                    if tenant_state.intent.demote_attached(node_id) {
                        tenant_state.sequence = tenant_state.sequence.next();
                        match tenant_state.schedule(scheduler) {
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -4,10 +4,7 @@ use std::{
    time::Duration,
 };

-use crate::{
-    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
-    persistence::TenantShardPersistence,
-};
+use crate::{metrics, persistence::TenantShardPersistence};
 use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -460,7 +457,22 @@ impl TenantState {
        // Add/remove nodes to fulfil policy
        use PlacementPolicy::*;
        match self.policy {
-            Attached(secondary_count) => {
+            Single => {
+                // Should have exactly one attached, and zero secondaries
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+
+                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
+                modified |= modified_attached;
+
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+            }
+            Double(secondary_count) => {
                let retain_secondaries = if self.intent.attached.is_none()
                    && scheduler.node_preferred(&self.intent.secondary).is_some()
                {
@@ -721,10 +733,7 @@ impl TenantState {
        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_reconcile_spawn
-            .inc();
+        metrics::RECONCILER.spawned.inc();
        let result_tx = result_tx.clone();
        let join_handle = tokio::task::spawn(
            async move {
@@ -742,12 +751,10 @@ impl TenantState {
                // TODO: wrap all remote API operations in cancellation check
                // as well.
                if reconciler.cancel.is_cancelled() {
-                    metrics::METRICS_REGISTRY
-                        .metrics_group
-                        .storage_controller_reconcile_complete
-                        .inc(ReconcileCompleteLabelGroup {
-                            status: ReconcileOutcome::Cancel,
-                        });
+                    metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
+                        .inc();
                    return;
                }

@@ -762,18 +769,18 @@ impl TenantState {
                }

                // Update result counter
-                let outcome_label = match &result {
-                    Ok(_) => ReconcileOutcome::Success,
-                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
-                    Err(_) => ReconcileOutcome::Error,
-                };
-
-                metrics::METRICS_REGISTRY
-                    .metrics_group
-                    .storage_controller_reconcile_complete
-                    .inc(ReconcileCompleteLabelGroup {
-                        status: outcome_label,
-                    });
+                match &result {
+                    Ok(_) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
+                    Err(ReconcileError::Cancel) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
+                    Err(_) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
+                }
+                .inc();

                result_tx
                    .send(ReconcileResult {
@@ -888,7 +895,7 @@ pub(crate) mod tests {

        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
        tenant_state
            .schedule(&mut scheduler)
            .expect("we have enough nodes, scheduling should work");
@@ -936,7 +943,7 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));

        tenant_state.observed.locations.insert(
            NodeId(3),
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -437,7 +437,7 @@ async fn handle_tenant(

            let placement_policy = match create_match.get_one::<String>("placement-policy") {
                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Attached(0),
+                _ => PlacementPolicy::Single,
            };

            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -523,6 +523,88 @@ async fn handle_tenant(
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .tenant_migrate(tenant_shard_id, new_pageserver_id)
+                .await?;
+
+            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
+        }
+        Some(("status", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+
+            let mut shard_table = comfy_table::Table::new();
+            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
+
+            let mut tenant_synthetic_size = None;
+
+            let storage_controller = StorageController::from_env(env);
+            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
+                let pageserver =
+                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
+
+                let size = pageserver
+                    .http_client
+                    .tenant_details(shard.shard_id)
+                    .await?
+                    .tenant_info
+                    .current_physical_size
+                    .unwrap();
+
+                shard_table.add_row([
+                    format!("{}", shard.shard_id.shard_slug()),
+                    format!("{}", shard.node_id.0),
+                    format!("{} MiB", size / (1024 * 1024)),
+                ]);
+
+                if shard.shard_id.is_zero() {
+                    tenant_synthetic_size =
+                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
+                }
+            }
+
+            let Some(synthetic_size) = tenant_synthetic_size else {
+                bail!("Shard 0 not found")
+            };
+
+            let mut tenant_table = comfy_table::Table::new();
+            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
+            tenant_table.add_row([
+                "Synthetic size".to_string(),
+                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
+            ]);
+
+            println!("{tenant_table}");
+            println!("{shard_table}");
+        }
+        Some(("shard-split", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
+            let shard_stripe_size: Option<ShardStripeSize> = matches
+                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
+                .cloned()
+                .unwrap();
+
+            let storage_controller = StorageController::from_env(env);
+            let result = storage_controller
+                .tenant_split(tenant_id, shard_count, shard_stripe_size)
+                .await?;
+            println!(
+                "Split tenant {} into shards {}",
+                tenant_id,
+                result
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+        }

        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -1496,6 +1578,19 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
+            .subcommand(Command::new("status")
+                .about("Human readable summary of the tenant's shards and attachment locations")
+                .arg(tenant_id_arg.clone()))
+            .subcommand(Command::new("shard-split")
+                .about("Increase the number of shards in the tenant")
+                .arg(tenant_id_arg.clone())
+                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
+                )
        )
        .subcommand(
            Command::new("pageserver")
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -127,8 +127,8 @@ pub struct PageServerConf {
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,

-    pub(crate) virtual_file_io_engine: Option<String>,
-    pub(crate) get_vectored_impl: Option<String>,
+    pub(crate) virtual_file_io_engine: String,
+    pub(crate) get_vectored_impl: String,
 }

 impl Default for PageServerConf {
@@ -139,8 +139,9 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            virtual_file_io_engine: None,
-            get_vectored_impl: None,
+            // FIXME: use the ones exposed by pageserver crate
+            virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
+            get_vectored_impl: "sequential".to_owned(),
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -101,16 +101,8 @@ impl PageServerNode {

        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
-        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
-            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
-        } else {
-            String::new()
-        };
-        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
-            format!("get_vectored_impl='{get_vectored_impl}'")
-        } else {
-            String::new()
-        };
+        let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
+        let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -475,7 +475,7 @@ impl StorageController {
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
            Method::GET,
-            format!("debug/v1/tenant/{tenant_id}/locate"),
+            format!("control/v1/tenant/{tenant_id}/locate"),
            None,
        )
        .await
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -6,10 +6,7 @@ use std::str::FromStr;
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-use crate::{
-    models::{ShardParameters, TenantConfig},
-    shard::{ShardStripeSize, TenantShardId},
-};
+use crate::{models::ShardParameters, shard::TenantShardId};

 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
@@ -60,31 +57,6 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantDescribeResponse {
-    pub shards: Vec<TenantDescribeResponseShard>,
-    pub stripe_size: ShardStripeSize,
-    pub policy: PlacementPolicy,
-    pub config: TenantConfig,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantDescribeResponseShard {
-    pub tenant_shard_id: TenantShardId,
-
-    pub node_attached: Option<NodeId>,
-    pub node_secondary: Vec<NodeId>,
-
-    pub last_error: String,
-
-    /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
-    pub is_reconciling: bool,
-    /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
-    pub is_pending_compute_notification: bool,
-    /// A shard split is currently underway
-    pub is_splitting: bool,
-}
-
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -209,8 +181,11 @@ impl From<NodeSchedulingPolicy> for String {
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub enum PlacementPolicy {
-    /// Normal live state: one attached pageserver and zero or more secondaries.
-    Attached(usize),
+    /// Cheapest way to attach a tenant: just one pageserver, no secondary
+    Single,
+    /// Production-ready way to attach a tenant: one attached pageserver and
+    /// some number of secondaries.
+    Double(usize),
    /// Create one secondary mode locations. This is useful when onboarding
    /// a tenant, or for an idle tenant that we might want to bring online quickly.
    Secondary,
@@ -232,14 +207,14 @@ mod test {
    /// Check stability of PlacementPolicy's serialization
    #[test]
    fn placement_policy_encoding() -> anyhow::Result<()> {
-        let v = PlacementPolicy::Attached(1);
+        let v = PlacementPolicy::Double(1);
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "{\"Attached\":1}");
+        assert_eq!(encoded, "{\"Double\":1}");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);

-        let v = PlacementPolicy::Detached;
+        let v = PlacementPolicy::Single;
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "\"Detached\"");
+        assert_eq!(encoded, "\"Single\"");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
        Ok(())
    }
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,9 +6,7 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-use crate::{
-    controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
-};
+use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};

 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
    pub register: Option<NodeRegisterRequest>,
 }

-fn default_mode() -> LocationConfigMode {
-    LocationConfigMode::AttachedSingle
-}
-
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
    pub id: TenantShardId,
-    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
-    pub gen: Option<u32>,
-
-    /// Default value only for backward compat: this field should be set
-    #[serde(default = "default_mode")]
-    pub mode: LocationConfigMode,
+    pub gen: u32,
 }
+
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -198,7 +198,6 @@ impl LocalFs {
            fs::OpenOptions::new()
                .write(true)
                .create(true)
-                .truncate(true)
                .open(&temp_file_path)
                .await
                .with_context(|| {
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
-async-compression.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
@@ -37,7 +36,6 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-tar.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
@@ -48,7 +46,6 @@ strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true
-walkdir.workspace = true

 pq_proto.workspace = true
 postgres_connection.workspace = true
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
    }
 }

-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

    let started_at = std::time::Instant::now();
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .err_handler(route_error_handler)
 }

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -87,8 +87,6 @@ pub mod failpoint_support;

 pub mod yielding_loop;

-pub mod zstd;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
 pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
-        .truncate(true)
        .write(true)
        .open(lock_file_path)
        .context("open lock file")?;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        let mut swapped = Inner::default();
        let sem = swapped.init_semaphore.clone();
        // acquire and forget right away, moving the control over to InitPermit
@@ -543,7 +543,7 @@ mod tests {
        target.set(42, permit);

        let (_answer, permit) = {
-            let guard = target
+            let mut guard = target
                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
                .await
                .unwrap();
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,60 +1,27 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum VecMapOrdering {
-    Greater,
-    GreaterOrEqual,
-}
-
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
-/// Ordering can be adjusted using [`VecMapOrdering`]
-/// during `VecMap` construction.
 #[derive(Clone, Debug)]
-pub struct VecMap<K, V> {
-    data: Vec<(K, V)>,
-    ordering: VecMapOrdering,
-}
+pub struct VecMap<K, V>(Vec<(K, V)>);

 impl<K, V> Default for VecMap<K, V> {
    fn default() -> Self {
-        VecMap {
-            data: Default::default(),
-            ordering: VecMapOrdering::Greater,
-        }
+        VecMap(Default::default())
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub enum VecMapError {
-    #[error("Key violates ordering constraint")]
-    InvalidKey,
-    #[error("Mismatched ordering constraints")]
-    ExtendOrderingError,
-}
+#[derive(Debug)]
+pub struct InvalidKey;

 impl<K: Ord, V> VecMap<K, V> {
-    pub fn new(ordering: VecMapOrdering) -> Self {
-        Self {
-            data: Vec::new(),
-            ordering,
-        }
-    }
-
-    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
-        Self {
-            data: Vec::with_capacity(capacity),
-            ordering,
-        }
-    }
-
    pub fn is_empty(&self) -> bool {
-        self.data.is_empty()
+        self.0.is_empty()
    }

    pub fn as_slice(&self) -> &[(K, V)] {
-        self.data.as_slice()
+        self.0.as_slice()
    }

    /// This function may panic if given a range where the lower bound is
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
        use std::ops::Bound::*;

-        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
+        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);

        let start_idx = match range.start_bound() {
            Unbounded => 0,
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
        };

        let end_idx = match range.end_bound() {
-            Unbounded => self.data.len(),
+            Unbounded => self.0.len(),
            Included(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
        };

-        &self.data[start_idx..end_idx]
+        &self.0[start_idx..end_idx]
    }

    /// Add a key value pair to the map.
-    /// If `key` is not respective of the `self` ordering the
-    /// pair will not be added and `InvalidKey` error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
-        self.validate_key_order(&key)?;
+    /// If `key` is less than or equal to the current maximum key
+    /// the pair will not be added and InvalidKey error will be returned.
+    pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
+        if let Some((last_key, _last_value)) = self.0.last() {
+            if &key <= last_key {
+                return Err(InvalidKey);
+            }
+        }

        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok(delta_size)
    }

    /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If `key` is not respective of the `self` ordering no updates or additions
-    /// will occur and `InvalidKey` error will be returned.
+    /// If `key` is less than the current maximum key no updates or additions
+    /// will occur and InvalidKey error will be returned.
    pub fn append_or_update_last(
        &mut self,
        key: K,
        mut value: V,
-    ) -> Result<(Option<V>, usize), VecMapError> {
-        if let Some((last_key, last_value)) = self.data.last_mut() {
+    ) -> Result<(Option<V>, usize), InvalidKey> {
+        if let Some((last_key, last_value)) = self.0.last_mut() {
            match key.cmp(last_key) {
-                Ordering::Less => return Err(VecMapError::InvalidKey),
+                Ordering::Less => return Err(InvalidKey),
                Ordering::Equal => {
                    std::mem::swap(last_value, &mut value);
                    const DELTA_SIZE: usize = 0;
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
        V: Clone,
    {
        let split_idx = self
-            .data
+            .0
            .binary_search_by_key(&cutoff, extract_key)
            .unwrap_or_else(std::convert::identity);

        (
-            VecMap {
-                data: self.data[..split_idx].to_vec(),
-                ordering: self.ordering,
-            },
-            VecMap {
-                data: self.data[split_idx..].to_vec(),
-                ordering: self.ordering,
-            },
+            VecMap(self.0[..split_idx].to_vec()),
+            VecMap(self.0[split_idx..].to_vec()),
        )
    }

    /// Move items from `other` to the end of `self`, leaving `other` empty.
-    /// If the `other` ordering is different from `self` ordering
-    /// `ExtendOrderingError` error will be returned.
-    /// If any keys in `other` is not respective of the ordering defined in
-    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
-    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
-        if self.ordering != other.ordering {
-            return Err(VecMapError::ExtendOrderingError);
-        }
+    /// If any key in `other` is less than or equal to any key in `self`,
+    /// `InvalidKey` error will be returned and no mutation will occur.
+    pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
+        let self_last_opt = self.0.last().map(extract_key);
+        let other_first_opt = other.0.first().map(extract_key);

-        let other_first_opt = other.data.last().map(extract_key);
-        if let Some(other_first) = other_first_opt {
-            self.validate_key_order(other_first)?;
-        }
-
-        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
-        Ok(delta_size)
-    }
-
-    /// Validate the current last key in `self` and key being
-    /// inserted against the order defined in `self`.
-    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
-        if let Some(last_key) = self.data.last().map(extract_key) {
-            match (&self.ordering, &key.cmp(last_key)) {
-                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
-                    return Err(VecMapError::InvalidKey);
-                }
-                (VecMapOrdering::Greater, Ordering::Greater) => {}
-                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
-                    return Err(VecMapError::InvalidKey);
-                }
-                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
+        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
+            if self_last >= other_first {
+                return Err(InvalidKey);
            }
        }

-        Ok(())
+        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
+        Ok(delta_size)
    }

    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
-        let old_cap = self.data.capacity();
-        op(&mut self.data);
-        let new_cap = self.data.capacity();
+        let old_cap = self.0.capacity();
+        op(&mut self.0);
+        let new_cap = self.0.capacity();

        match old_cap.cmp(&new_cap) {
            Ordering::Less => {
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
            Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
        }
    }
-
-    /// Similar to `from_iter` defined in `FromIter` trait except
-    /// that it accepts an [`VecMapOrdering`]
-    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
-        let iter = iter.into_iter();
-        let initial_capacity = {
-            match iter.size_hint() {
-                (lower_bound, None) => lower_bound,
-                (_, Some(upper_bound)) => upper_bound,
-            }
-        };
-
-        let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
-        for (key, value) in iter {
-            vec_map
-                .append(key, value)
-                .expect("The passed collection needs to be sorted!");
-        }
-
-        vec_map
-    }
-}
-
-impl<K: Ord, V> IntoIterator for VecMap<K, V> {
-    type Item = (K, V);
-    type IntoIter = std::vec::IntoIter<(K, V)>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.data.into_iter()
-    }
 }

 fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
 mod tests {
    use std::{collections::BTreeMap, ops::Bound};

-    use super::{VecMap, VecMapOrdering};
+    use super::VecMap;

    #[test]
    fn unbounded_range() {
@@ -396,59 +310,5 @@ mod tests {
        left.extend(&mut one_map).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(one_map.as_slice(), &[(1, ())]);
-
-        let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
-        map_greater_or_equal.append(2, ()).unwrap();
-        map_greater_or_equal.append(2, ()).unwrap();
-
-        left.extend(&mut map_greater_or_equal).unwrap_err();
-        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
-        assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
-    }
-
-    #[test]
-    fn extend_with_ordering() {
-        let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
-        left.append(0, ()).unwrap();
-        assert_eq!(left.as_slice(), &[(0, ())]);
-
-        let mut greater_right = VecMap::new(VecMapOrdering::Greater);
-        greater_right.append(0, ()).unwrap();
-        left.extend(&mut greater_right).unwrap_err();
-        assert_eq!(left.as_slice(), &[(0, ())]);
-
-        let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
-        greater_or_equal_right.append(2, ()).unwrap();
-        greater_or_equal_right.append(2, ()).unwrap();
-        left.extend(&mut greater_or_equal_right).unwrap();
-        assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
-    }
-
-    #[test]
-    fn vec_map_from_sorted() {
-        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
-        let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
-        assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
-
-        let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
-        let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
-        assert_eq!(
-            vec_map.as_slice(),
-            &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
-        );
-    }
-
-    #[test]
-    #[should_panic]
-    fn vec_map_from_unsorted_greater() {
-        let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
-        let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
-    }
-
-    #[test]
-    #[should_panic]
-    fn vec_map_from_unsorted_greater_or_equal() {
-        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
-        let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
    }
 }
--- a/libs/utils/src/zstd.rs
+++ b/libs/utils/src/zstd.rs
@@ -1,78 +0,0 @@
-use std::io::SeekFrom;
-
-use anyhow::{Context, Result};
-use async_compression::{
-    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
-    zstd::CParameter,
-    Level,
-};
-use camino::Utf8Path;
-use nix::NixPath;
-use tokio::{
-    fs::{File, OpenOptions},
-    io::AsyncBufRead,
-    io::AsyncSeekExt,
-    io::AsyncWriteExt,
-};
-use tokio_tar::{Archive, Builder, HeaderMode};
-use walkdir::WalkDir;
-
-/// Creates a Zstandard tarball.
-pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
-    let file = OpenOptions::new()
-        .create(true)
-        .truncate(true)
-        .read(true)
-        .write(true)
-        .open(&tarball)
-        .await
-        .with_context(|| format!("tempfile creation {tarball}"))?;
-
-    let mut paths = Vec::new();
-    for entry in WalkDir::new(path) {
-        let entry = entry?;
-        let metadata = entry.metadata().expect("error getting dir entry metadata");
-        // Also allow directories so that we also get empty directories
-        if !(metadata.is_file() || metadata.is_dir()) {
-            continue;
-        }
-        let path = entry.into_path();
-        paths.push(path);
-    }
-    // Do a sort to get a more consistent listing
-    paths.sort_unstable();
-    let zstd = ZstdEncoder::with_quality_and_params(
-        file,
-        Level::Default,
-        &[CParameter::enable_long_distance_matching(true)],
-    );
-    let mut builder = Builder::new(zstd);
-    // Use reproducible header mode
-    builder.mode(HeaderMode::Deterministic);
-    for p in paths {
-        let rel_path = p.strip_prefix(path)?;
-        if rel_path.is_empty() {
-            // The top directory should not be compressed,
-            // the tar crate doesn't like that
-            continue;
-        }
-        builder.append_path_with_name(&p, rel_path).await?;
-    }
-    let mut zstd = builder.into_inner().await?;
-    zstd.shutdown().await?;
-    let mut compressed = zstd.into_inner();
-    let compressed_len = compressed.metadata().await?.len();
-    compressed.seek(SeekFrom::Start(0)).await?;
-    Ok((compressed, compressed_len))
-}
-
-/// Creates a Zstandard tarball.
-pub async fn extract_zst_tarball(
-    path: &Utf8Path,
-    tarball: impl AsyncBufRead + Unpin,
-) -> Result<()> {
-    let decoder = Box::pin(ZstdDecoder::new(tarball));
-    let mut archive = Archive::new(decoder);
-    archive.unpack(path).await?;
-    Ok(())
-}
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,156 +1,160 @@
-//! Quantify a single walredo manager's throughput under N concurrent callers.
+//! Simple benchmarking around walredo.
 //!
-//! The benchmark implementation ([`bench_impl`]) is parametrized by
-//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
-//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
-//! - `nclients` => number of clients (more on this shortly).
+//! Right now they hope to just set a baseline. Later we can try to expand into latency and
+//! throughput after figuring out the coordinated omission problems below.
 //!
-//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
-//! It spawns `nclients` times [`client`] tokio tasks.
-//! Each task executes the `redo_work` `n_redos/nclients` times.
+//! There are two sets of inputs: `short` and `medium`. They were collected on postgres v14 by
+//! logging what happens when a sequential scan is requested on a small table, then picking out two
+//! suitable requests from the logs.
 //!
-//! We exercise the following combinations:
-//! - `redo_work = short / medium``
-//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
-//! We let `criterion` determine the `n_redos` using `iter_custom`.
-//! The idea is that for each `(redo_work, nclients)` combination,
-//! criterion will run the `bench_impl` multiple times with different `n_redos`.
-//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
-//! Criterion will divide that by `n_redos` to compute the "time per iteration".
-//! In our case, "time per iteration" means "time per redo_work execution".
-//!
-//! NB: the way by which `iter_custom` determines the "number of iterations"
-//! is called sampling. Apparently the idea here is to detect outliers.
-//! We're not sure whether the current choice of sampling method makes sense.
-//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
-//!
-//! # Reference Numbers
-//!
-//! 2024-03-20 on i3en.3xlarge
-//!
-//! ```text
-//! short/1                 time:   [26.483 µs 26.614 µs 26.767 µs]
-//! short/2                 time:   [32.223 µs 32.465 µs 32.767 µs]
-//! short/4                 time:   [47.203 µs 47.583 µs 47.984 µs]
-//! short/8                 time:   [89.135 µs 89.612 µs 90.139 µs]
-//! short/16                time:   [190.12 µs 191.52 µs 192.88 µs]
-//! short/32                time:   [380.96 µs 382.63 µs 384.20 µs]
-//! short/64                time:   [736.86 µs 741.07 µs 745.03 µs]
-//! short/128               time:   [1.4106 ms 1.4206 ms 1.4294 ms]
-//! medium/1                time:   [111.81 µs 112.25 µs 112.79 µs]
-//! medium/2                time:   [158.26 µs 159.13 µs 160.21 µs]
-//! medium/4                time:   [334.65 µs 337.14 µs 340.07 µs]
-//! medium/8                time:   [675.32 µs 679.91 µs 685.25 µs]
-//! medium/16               time:   [1.2929 ms 1.2996 ms 1.3067 ms]
-//! medium/32               time:   [2.4295 ms 2.4461 ms 2.4623 ms]
-//! medium/64               time:   [4.3973 ms 4.4458 ms 4.4875 ms]
-//! medium/128              time:   [7.5955 ms 7.7847 ms 7.9481 ms]
-//! ```
+//! Reference data (git blame to see commit) on an i3en.3xlarge
+//! ```text
+//! short/short/1           time:   [39.175 µs 39.348 µs 39.536 µs]
+//! short/short/2           time:   [51.227 µs 51.487 µs 51.755 µs]
+//! short/short/4           time:   [76.048 µs 76.362 µs 76.674 µs]
+//! short/short/8           time:   [128.94 µs 129.82 µs 130.74 µs]
+//! short/short/16          time:   [227.84 µs 229.00 µs 230.28 µs]
+//! short/short/32          time:   [455.97 µs 457.81 µs 459.90 µs]
+//! short/short/64          time:   [902.46 µs 904.84 µs 907.32 µs]
+//! short/short/128         time:   [1.7416 ms 1.7487 ms 1.7561 ms]
+//! ```
+
+use std::sync::Arc;

 use bytes::{Buf, Bytes};
-use criterion::{BenchmarkId, Criterion};
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
-use pageserver_api::{key::Key, shard::TenantShardId};
-use std::{
-    sync::Arc,
-    time::{Duration, Instant},
+use pageserver::{
+    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
-use tokio::{sync::Barrier, task::JoinSet};
+use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
 use utils::{id::TenantId, lsn::Lsn};

-fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
-}
-criterion::criterion_group!(benches, bench);
-criterion::criterion_main!(benches);
+fn redo_scenarios(c: &mut Criterion) {
+    // logging should be enabled when adding more inputs, since walredo will only report malformed
+    // input to the stderr.
+    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();

-// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+
+    let manager = Arc::new(manager);
+
+    {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        tracing::info!("executing first");
+        rt.block_on(short().execute(&manager)).unwrap();
+        tracing::info!("first executed");
+    }
+
+    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
+
+    let mut group = c.benchmark_group("short");
+    group.sampling_mode(criterion::SamplingMode::Flat);
+
+    for thread_count in thread_counts {
+        group.bench_with_input(
+            BenchmarkId::new("short", thread_count),
+            &thread_count,
+            |b, thread_count| {
+                add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
+            },
+        );
+    }
+    drop(group);
+
+    let mut group = c.benchmark_group("medium");
+    group.sampling_mode(criterion::SamplingMode::Flat);
+
+    for thread_count in thread_counts {
+        group.bench_with_input(
+            BenchmarkId::new("medium", thread_count),
+            &thread_count,
+            |b, thread_count| {
+                add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
+            },
+        );
+    }
+    drop(group);
+}
+
+/// Sets up a multi-threaded tokio runtime with default worker thread count,
+/// then spawns `nrequesters` tasks that repeatedly:
+/// - get input from `input_factory()`
+/// - call `manager.request_redo()` with their input
+///
+/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
+///
+/// Using tokio's default worker thread count means the results will differ on machines
+/// with different core counts. We don't care about that; the performance will always
+/// be different on different hardware. To compare performance of different software versions,
+/// use the same hardware.
+fn add_multithreaded_walredo_requesters(
+    b: &mut criterion::Bencher,
+    nrequesters: usize,
+    manager: &Arc<PostgresRedoManager>,
+    input_factory: fn() -> Request,
+) {
+    assert_ne!(nrequesters, 0);
+
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

-    let start = Arc::new(Barrier::new(nclients as usize));
+    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));

-    let mut tasks = JoinSet::new();
-
-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-    let manager = Arc::new(manager);
-
-    for _ in 0..nclients {
-        rt.block_on(async {
-            tasks.spawn(client(
-                Arc::clone(&manager),
-                Arc::clone(&start),
-                Arc::clone(&redo_work),
-                // divide the amount of work equally among the clients
-                n_redos / nclients,
-            ))
+    let mut requesters = JoinSet::new();
+    for _ in 0..nrequesters {
+        let _entered = rt.enter();
+        let manager = manager.clone();
+        let barrier = barrier.clone();
+        requesters.spawn(async move {
+            loop {
+                let input = input_factory();
+                barrier.wait().await;
+                let page = input.execute(&manager).await.unwrap();
+                assert_eq!(page.remaining(), 8192);
+                barrier.wait().await;
+            }
        });
    }

-    rt.block_on(async move {
-        let mut total_wallclock_time = std::time::Duration::from_millis(0);
-        while let Some(res) = tasks.join_next().await {
-            total_wallclock_time += res.unwrap();
-        }
-        total_wallclock_time
-    })
+    let do_one_iteration = || {
+        rt.block_on(async {
+            barrier.wait().await;
+            // wait for work to complete
+            barrier.wait().await;
+        })
+    };
+
+    b.iter_batched(
+        || {
+            // warmup
+            do_one_iteration();
+        },
+        |()| {
+            // work loop
+            do_one_iteration();
+        },
+        criterion::BatchSize::PerIteration,
+    );
+
+    rt.block_on(requesters.shutdown());
 }

-async fn client(
-    mgr: Arc<PostgresRedoManager>,
-    start: Arc<Barrier>,
-    redo_work: Arc<Request>,
-    n_redos: u64,
-) -> Duration {
-    start.wait().await;
-    let start = Instant::now();
-    for _ in 0..n_redos {
-        let page = redo_work.execute(&mgr).await.unwrap();
-        assert_eq!(page.remaining(), 8192);
-        // The real pageserver will rarely if ever do 2 walredos in a row without
-        // yielding to the executor.
-        tokio::task::yield_now().await;
-    }
-    start.elapsed()
-}
+criterion_group!(benches, redo_scenarios);
+criterion_main!(benches);

 macro_rules! lsn {
    ($input:expr) => {{
@@ -162,46 +166,12 @@ macro_rules! lsn {
    }};
 }

-/// Simple wrapper around `WalRedoManager::request_redo`.
-///
-/// In benchmarks this is cloned around.
-#[derive(Clone)]
-struct Request {
-    key: Key,
-    lsn: Lsn,
-    base_img: Option<(Lsn, Bytes)>,
-    records: Vec<(Lsn, NeonWalRecord)>,
-    pg_version: u32,
-}
-
-impl Request {
-    async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
-        let Request {
-            key,
-            lsn,
-            base_img,
-            records,
-            pg_version,
-        } = self;
-
-        // TODO: avoid these clones
-        manager
-            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
-            .await
-    }
-
-    fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
-        let rec = Bytes::from_static(bytes);
-        NeonWalRecord::Postgres { will_init, rec }
-    }
-
-    /// Short payload, 1132 bytes.
-    // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
-    // for null bytes.
-    #[allow(clippy::octal_escapes)]
-    pub fn short_input() -> Request {
-        let pg_record = Self::pg_record;
-        Request {
+/// Short payload, 1132 bytes.
+// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
+// for null bytes.
+#[allow(clippy::octal_escapes)]
+fn short() -> Request {
+    Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -224,14 +194,13 @@ impl Request {
        ],
        pg_version: 14,
    }
-    }
+}

-    /// Medium sized payload, serializes as 26393 bytes.
-    // see [`short`]
-    #[allow(clippy::octal_escapes)]
-    pub fn medium_input() -> Request {
-        let pg_record = Self::pg_record;
-        Request {
+/// Medium sized payload, serializes as 26393 bytes.
+// see [`short`]
+#[allow(clippy::octal_escapes)]
+fn medium() -> Request {
+    Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -473,5 +442,37 @@ impl Request {
        ],
        pg_version: 14,
    }
+}
+
+fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
+    let rec = Bytes::from_static(bytes);
+    NeonWalRecord::Postgres { will_init, rec }
+}
+
+/// Simple wrapper around `WalRedoManager::request_redo`.
+///
+/// In benchmarks this is cloned around.
+#[derive(Clone)]
+struct Request {
+    key: Key,
+    lsn: Lsn,
+    base_img: Option<(Lsn, Bytes)>,
+    records: Vec<(Lsn, NeonWalRecord)>,
+    pg_version: u32,
+}
+
+impl Request {
+    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
+        let Request {
+            key,
+            lsn,
+            base_img,
+            records,
+            pg_version,
+        } = self;
+
+        manager
+            .request_redo(key, lsn, base_img, records, pg_version)
+            .await
    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -5,8 +5,7 @@ use pageserver_api::{
    controller_api::NodeRegisterRequest,
    shard::TenantShardId,
    upcall_api::{
-        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-        ValidateRequestTenant, ValidateResponse,
+        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
    },
 };
 use serde::{de::DeserializeOwned, Serialize};
@@ -38,9 +37,7 @@ pub trait ControlPlaneGenerationsApi {
    fn re_attach(
        &self,
        conf: &PageServerConf,
-    ) -> impl Future<
-        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
-    > + Send;
+    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
    fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
@@ -121,7 +118,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
    async fn re_attach(
        &self,
        conf: &PageServerConf,
-    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
+    ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -184,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|rart| (rart.id, rart))
+            .map(|t| (t.id, Generation::new(t.gen)))
            .collect::<HashMap<_, _>>())
    }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -724,8 +724,8 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
-    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
-    use std::{io::ErrorKind, time::Duration};
+    use pageserver_api::shard::ShardIndex;
+    use std::io::ErrorKind;
    use tracing::info;

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -834,10 +834,9 @@ mod test {
        async fn re_attach(
            &self,
            _conf: &PageServerConf,
-        ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
+        ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
-
        async fn validate(
            &self,
            tenants: Vec<(TenantShardId, Generation)>,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,7 +36,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
-use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -2267,7 +2266,6 @@ pub fn make_router(

    Ok(router
        .data(state)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,20 +2,28 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
+use std::io::SeekFrom;
 use std::path::{Path, PathBuf};

 use anyhow::{bail, ensure, Context, Result};
+use async_compression::tokio::bufread::ZstdDecoder;
+use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
-use tokio::io::{AsyncRead, AsyncReadExt};
+use nix::NixPath;
+use tokio::fs::{File, OpenOptions};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tokio_tar::Archive;
+use tokio_tar::Builder;
+use tokio_tar::HeaderMode;
 use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
+use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
@@ -625,3 +633,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
    reader.read_to_end(&mut buf).await?;
    Ok(Bytes::from(buf))
 }
+
+/// Archive `pgdata_path` (regular files and directories; the root entry itself is skipped)
+/// as a zstd-compressed tar in `tmp_path`, which is created or truncated. Any I/O or walk
+/// error is returned. Yields the file rewound to offset 0 plus its compressed byte length.
+pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(tmp_path)
+        .await
+        .with_context(|| format!("tempfile creation {tmp_path}"))?;
+    let mut paths = Vec::new();
+    for entry in WalkDir::new(pgdata_path) {
+        let entry = entry?;
+        let metadata = entry.metadata().context("error getting dir entry metadata")?;
+        // Also allow directories so that we also get empty directories
+        if !(metadata.is_file() || metadata.is_dir()) {
+            continue;
+        }
+        paths.push(entry.into_path());
+    }
+    // Do a sort to get a more consistent listing
+    paths.sort_unstable();
+    let zstd = ZstdEncoder::with_quality_and_params(
+        file,
+        Level::Default,
+        &[CParameter::enable_long_distance_matching(true)],
+    );
+    let mut builder = Builder::new(zstd);
+    // Use reproducible header mode
+    builder.mode(HeaderMode::Deterministic);
+    for path in paths {
+        let rel_path = path.strip_prefix(pgdata_path)?;
+        if rel_path.is_empty() {
+            // The top directory should not be compressed, the tar crate doesn't like that
+            continue;
+        }
+        builder.append_path_with_name(&path, rel_path).await?;
+    }
+    let mut zstd = builder.into_inner().await?;
+    zstd.shutdown().await?;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
+    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
+        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
+    }
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
+}
+
+/// Decompress the zstd stream `tar_zst` and unpack the contained tar into `pgdata_path`.
+pub async fn extract_tar_zst(
+    pgdata_path: &Utf8Path,
+    tar_zst: impl AsyncBufRead + Unpin,
+) -> Result<()> {
+    let decoder = ZstdDecoder::new(tar_zst);
+    Archive::new(Box::pin(decoder)).unpack(pgdata_path).await?;
+    Ok(())
+}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -34,7 +34,6 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
-use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

 const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1547,13 +1546,12 @@ impl<'a> DatadirModification<'a> {
        if !self.pending_updates.is_empty() {
            // The put_batch call below expects expects the inputs to be sorted by Lsn,
            // so we do that first.
-            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
-                self.pending_updates
-                    .drain()
-                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
-                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
-                VecMapOrdering::GreaterOrEqual,
-            );
+            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
+                .pending_updates
+                .drain()
+                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
+                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
+                .collect();

            writer.put_batch(lsn_ordered_batch, ctx).await?;
        }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -43,8 +43,6 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
-use utils::zstd::create_zst_tarball;
-use utils::zstd::extract_zst_tarball;

 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
@@ -202,13 +200,6 @@ pub(super) struct AttachedTenantConf {
 }

 impl AttachedTenantConf {
-    fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
-        Self {
-            tenant_conf,
-            location,
-        }
-    }
-
    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
        match &location_conf.mode {
            LocationMode::Attached(attach_conf) => Ok(Self {
@@ -685,20 +676,9 @@ impl Tenant {
                }

                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
-                enum BrokenVerbosity {
-                    Error,
-                    Info
-                }
                let make_broken =
-                    |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
-                        match verbosity {
-                            BrokenVerbosity::Info => {
-                                info!("attach cancelled, setting tenant state to Broken: {err}");
-                            },
-                            BrokenVerbosity::Error => {
-                                error!("attach failed, setting tenant state to Broken: {err:?}");
-                            }
-                        }
+                    |t: &Tenant, err: anyhow::Error| {
+                        error!("attach failed, setting tenant state to Broken: {err:?}");
                        t.state.send_modify(|state| {
                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                            // if it errors, we will call make_broken when tenant is already in Stopping.
@@ -762,7 +742,7 @@ impl Tenant {
                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
                            // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
+                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                            return Ok(());
                        },
                    )
@@ -784,7 +764,7 @@ impl Tenant {
                        match res {
                            Ok(p) => Some(p),
                            Err(e) => {
-                                make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                                make_broken(&tenant_clone, anyhow::anyhow!(e));
                                return Ok(());
                            }
                        }
@@ -808,7 +788,7 @@ impl Tenant {
                    {
                        Ok(should_resume_deletion) => should_resume_deletion,
                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
                            return Ok(());
                        }
                    }
@@ -838,7 +818,7 @@ impl Tenant {
                    .await;

                    if let Err(e) = deleted {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }

                    return Ok(());
@@ -859,7 +839,7 @@ impl Tenant {
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }
                }

@@ -3062,13 +3042,8 @@ impl Tenant {
            }
        }

-        let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
-        const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
-        if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
-            warn!(
-                "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
-            );
-        }
+        let (pgdata_zstd, tar_zst_size) =
+            import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;

        pausable_failpoint!("before-initdb-upload");

@@ -3168,7 +3143,7 @@ impl Tenant {

            let buf_read =
                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
-            extract_zst_tarball(&pgdata_path, buf_read)
+            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;
        } else {
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -196,17 +196,16 @@ impl LocationConf {
    /// For use when attaching/re-attaching: update the generation stored in this
    /// structure.  If we were in a secondary state, promote to attached (posession
    /// of a fresh generation implies this).
-    pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
+    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
        match &mut self.mode {
            LocationMode::Attached(attach_conf) => {
                attach_conf.generation = generation;
-                attach_conf.attach_mode = mode;
            }
            LocationMode::Secondary(_) => {
                // We are promoted to attached by the control plane's re-attach response
                self.mode = LocationMode::Attached(AttachedLocationConfig {
                    generation,
-                    attach_mode: mode,
+                    attach_mode: AttachmentMode::Single,
                })
            }
        }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -111,7 +111,6 @@ async fn create_local_delete_mark(
    let _ = std::fs::OpenOptions::new()
        .write(true)
        .create(true)
-        .truncate(true)
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,13 +2,13 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
-use pageserver_api::models::{LocationConfigMode, ShardParameters};
+use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{
    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
-use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -125,46 +125,6 @@ pub(crate) enum ShardSelector {
    Page(Key),
 }

-/// A convenience for use with the re_attach ControlPlaneClient function: rather
-/// than the serializable struct, we build this enum that encapsulates
-/// the invariant that attached tenants always have generations.
-///
-/// This represents the subset of a LocationConfig that we receive during re-attach.
-pub(crate) enum TenantStartupMode {
-    Attached((AttachmentMode, Generation)),
-    Secondary,
-}
-
-impl TenantStartupMode {
-    /// Return the generation & mode that should be used when starting
-    /// this tenant.
-    ///
-    /// If this returns None, the re-attach struct is in an invalid state and
-    /// should be ignored in the response.
-    fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
-        match (rart.mode, rart.gen) {
-            (LocationConfigMode::Detached, _) => None,
-            (LocationConfigMode::Secondary, _) => Some(Self::Secondary),
-            (LocationConfigMode::AttachedMulti, Some(g)) => {
-                Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
-            }
-            (LocationConfigMode::AttachedSingle, Some(g)) => {
-                Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
-            }
-            (LocationConfigMode::AttachedStale, Some(g)) => {
-                Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
-            }
-            _ => {
-                tracing::warn!(
-                    "Received invalid re-attach state for tenant {}: {rart:?}",
-                    rart.id
-                );
-                None
-            }
-        }
-    }
-}
-
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -311,7 +271,7 @@ pub struct TenantManager {

 fn emergency_generations(
    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantShardId, TenantStartupMode> {
+) -> HashMap<TenantShardId, Generation> {
    tenant_confs
        .iter()
        .filter_map(|(tid, lc)| {
@@ -319,15 +279,12 @@ fn emergency_generations(
                Ok(lc) => lc,
                Err(_) => return None,
            };
-            Some((
-                *tid,
-                match &lc.mode {
-                    LocationMode::Attached(alc) => {
-                        TenantStartupMode::Attached((alc.attach_mode, alc.generation))
-                    }
-                    LocationMode::Secondary(_) => TenantStartupMode::Secondary,
-                },
-            ))
+            let gen = match &lc.mode {
+                LocationMode::Attached(alc) => Some(alc.generation),
+                LocationMode::Secondary(_) => None,
+            };
+
+            gen.map(|g| (*tid, g))
        })
        .collect()
 }
@@ -337,7 +294,7 @@ async fn init_load_generations(
    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
    resources: &TenantSharedResources,
    cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
+) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
    let generations = if conf.control_plane_emergency_mode {
        error!(
            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
@@ -347,12 +304,7 @@ async fn init_load_generations(
        info!("Calling control plane API to re-attach tenants");
        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
        match client.re_attach(conf).await {
-            Ok(tenants) => tenants
-                .into_iter()
-                .flat_map(|(id, rart)| {
-                    TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
-                })
-                .collect(),
+            Ok(tenants) => tenants,
            Err(RetryForeverError::ShuttingDown) => {
                anyhow::bail!("Shut down while waiting for control plane re-attach response")
            }
@@ -370,17 +322,9 @@ async fn init_load_generations(
    // Must only do this if remote storage is enabled, otherwise deletion queue
    // is not running and channel push will fail.
    if resources.remote_storage.is_some() {
-        let attached_tenants = generations
-            .iter()
-            .flat_map(|(id, start_mode)| {
-                match start_mode {
-                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
-                    TenantStartupMode::Secondary => None,
-                }
-                .map(|gen| (*id, *gen))
-            })
-            .collect();
-        resources.deletion_queue_client.recover(attached_tenants)?;
+        resources
+            .deletion_queue_client
+            .recover(generations.clone())?;
    }

    Ok(Some(generations))
@@ -546,8 +490,9 @@ pub async fn init_tenant_mgr(
    // Scan local filesystem for attached tenants
    let tenant_configs = init_load_tenant_configs(conf).await?;

-    // Determine which tenants are to be secondary or attached, and in which generation
-    let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+    // Determine which tenants are to be attached
+    let tenant_generations =
+        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

    tracing::info!(
        "Attaching {} tenants at startup, warming up {} at a time",
@@ -577,102 +522,97 @@ pub async fn init_tenant_mgr(
            }
        };

-        // FIXME: if we were attached, and get demoted to secondary on re-attach, we
-        // don't have a place to get a config.
-        // (https://github.com/neondatabase/neon/issues/5377)
-        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
-            SecondaryLocationConfig { warm: true };
-
-        // Update the location config according to the re-attach response
-        if let Some(tenant_modes) = &tenant_modes {
+        let generation = if let Some(generations) = &tenant_generations {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
-            match tenant_modes.get(&tenant_shard_id) {
-                None => {
-                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
-                        );
-                    }
-
-                    // We deleted local content: move on to next tenant, don't try and spawn this one.
-                    continue;
-                }
-                Some(TenantStartupMode::Secondary) => {
-                    if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
-                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
-                    }
-                }
-                Some(TenantStartupMode::Attached((attach_mode, generation))) => {
-                    let old_gen_higher = match &location_conf.mode {
-                        LocationMode::Attached(AttachedLocationConfig {
-                            generation: old_generation,
-                            attach_mode: _attach_mode,
-                        }) => {
-                            if old_generation > generation {
-                                Some(old_generation)
-                            } else {
-                                None
-                            }
-                        }
-                        _ => None,
-                    };
-                    if let Some(old_generation) = old_gen_higher {
+            if let Some(gen) = generations.get(&tenant_shard_id) {
+                if let LocationMode::Attached(attached) = &location_conf.mode {
+                    if attached.generation > *gen {
                        tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
-                            old_generation
+                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            attached.generation
                        );

                        // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
                        // local disk content: demote to secondary rather than detaching.
-                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
-                    } else {
-                        location_conf.attach_in_generation(*attach_mode, *generation);
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf.clone(),
+                                &SecondaryLocationConfig { warm: false },
+                            )),
+                        );
                    }
                }
+                *gen
+            } else {
+                match &location_conf.mode {
+                    LocationMode::Secondary(secondary_config) => {
+                        // We do not require the control plane's permission for secondary mode
+                        // tenants, because they do no remote writes and hence require no
+                        // generation number
+                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf,
+                                secondary_config,
+                            )),
+                        );
+                    }
+                    LocationMode::Attached(_) => {
+                        // TODO: augment re-attach API to enable the control plane to
+                        // instruct us about secondary attachments.  That way, instead of throwing
+                        // away local state, we can gracefully fall back to secondary here, if the control
+                        // plane tells us so.
+                        // (https://github.com/neondatabase/neon/issues/5377)
+                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
+                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                            );
+                        }
+                    }
+                };
+
+                continue;
            }
        } else {
            // Legacy mode: no generation information, any tenant present
            // on local disk may activate
            info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
+            Generation::none()
        };

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
+        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

        let shard_identity = location_conf.shard;
-        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => {
-                match tenant_spawn(
-                    conf,
-                    tenant_shard_id,
-                    &tenant_dir_path,
-                    resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                    shard_identity,
-                    Some(init_order.clone()),
-                    &TENANTS,
-                    SpawnMode::Lazy,
-                    &ctx,
-                ) {
-                    Ok(tenant) => TenantSlot::Attached(tenant),
-                    Err(e) => {
-                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
-                        continue;
-                    }
-                }
+        match tenant_spawn(
+            conf,
+            tenant_shard_id,
+            &tenant_dir_path,
+            resources.clone(),
+            AttachedTenantConf::try_from(location_conf)?,
+            shard_identity,
+            Some(init_order.clone()),
+            &TENANTS,
+            SpawnMode::Lazy,
+            &ctx,
+        ) {
+            Ok(tenant) => {
+                tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
            }
-            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
-                tenant_shard_id,
-                shard_identity,
-                location_conf.tenant_conf,
-                &secondary_conf,
-            )),
-        };
-
-        tenants.insert(tenant_shard_id, slot);
+            Err(e) => {
+                error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
+            }
+        }
    }

    info!("Processed {} local tenants at startup", tenants.len());
@@ -1722,9 +1662,9 @@ impl TenantManager {
                    .layers
                    .read()
                    .await
-                    .likely_resident_layers()
-                    .collect::<Vec<_>>();
-
+                    .resident_layers()
+                    .collect::<Vec<_>>()
+                    .await;
                for layer in timeline_layers {
                    let relative_path = layer
                        .local_path()
@@ -2203,7 +2143,7 @@ pub(crate) async fn load_tenant(

    let mut location_conf =
        Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
-    location_conf.attach_in_generation(AttachmentMode::Single, generation);
+    location_conf.attach_in_generation(generation);

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -15,7 +15,6 @@ use crate::{
    tenant::{
        config::SecondaryLocationConfig,
        debug_assert_current_span_has_tenant_and_timeline_id,
-        ephemeral_file::is_ephemeral_file,
        remote_timeline_client::{
            index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
@@ -535,11 +534,7 @@ impl<'a> TenantDownloader<'a> {
            .await
            .maybe_fatal_err(&context_msg)?;

-        tracing::debug!(
-            "Wrote local heatmap to {}, with {} timelines",
-            heatmap_path,
-            heatmap.timelines.len()
-        );
+        tracing::debug!("Wrote local heatmap to {}", heatmap_path);

        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
        // principle that deletions should be done before writes wherever possible, and so that we can use this
@@ -552,10 +547,6 @@ impl<'a> TenantDownloader<'a> {
        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
            if self.secondary_state.cancel.is_cancelled() {
-                tracing::debug!(
-                    "Cancelled before downloading timeline {}",
-                    timeline.timeline_id
-                );
                return Ok(());
            }

@@ -773,13 +764,10 @@ impl<'a> TenantDownloader<'a> {
            }
        };

-        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
-
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
-                tracing::debug!("Cancelled -- dropping out of layer loop");
                return Ok(());
            }

@@ -962,10 +950,7 @@ async fn init_timeline_state(
            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
            continue;
-        } else if crate::is_temporary(&file_path)
-            || is_temp_download_file(&file_path)
-            || is_ephemeral_file(file_name)
-        {
+        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
            // Temporary files are frequently left behind from restarting during downloads
            tracing::info!("Cleaning up temporary file {file_path}");
            if let Err(e) = tokio::fs::remove_file(&file_path)
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -300,7 +300,6 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!("Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer/failpoints.rs
+++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs
@@ -1,119 +0,0 @@
-//! failpoints for unit tests, implying `#[cfg(test)]`.
-//!
-//! These are not accessible over http.
-
-use super::*;
-
-impl Layer {
-    /// Enable a failpoint from a unit test.
-    pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
-        self.0.failpoints.lock().unwrap().push(failpoint);
-    }
-}
-
-impl LayerInner {
-    /// Query if this failpoint is enabled, as in, arrive at a failpoint.
-    ///
-    /// Calls to this method need to be `#[cfg(test)]` guarded.
-    pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
-        let fut = {
-            let mut fps = self.failpoints.lock().unwrap();
-            // find the *last* failpoint for cases in which we need to use multiple for the same
-            // thing (two blocked evictions)
-            let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
-
-            let Some(fp) = fp else {
-                return Ok(());
-            };
-
-            fp.hit()
-        };
-
-        fut.await
-    }
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub(crate) enum FailpointKind {
-    /// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
-    AfterDeterminingLayerNeedsNoDownload,
-    /// Failpoint for stalling eviction starting
-    WaitBeforeStartingEvicting,
-    /// Failpoint hit in the spawned task
-    WaitBeforeDownloading,
-}
-
-pub(crate) enum Failpoint {
-    AfterDeterminingLayerNeedsNoDownload,
-    WaitBeforeStartingEvicting(
-        Option<utils::completion::Completion>,
-        utils::completion::Barrier,
-    ),
-    WaitBeforeDownloading(
-        Option<utils::completion::Completion>,
-        utils::completion::Barrier,
-    ),
-}
-
-impl Failpoint {
-    fn kind(&self) -> FailpointKind {
-        match self {
-            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
-                FailpointKind::AfterDeterminingLayerNeedsNoDownload
-            }
-            Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
-            Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
-        }
-    }
-
-    fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
-        use futures::future::FutureExt;
-
-        // use boxed futures to avoid Either hurdles
-        match self {
-            Failpoint::AfterDeterminingLayerNeedsNoDownload => {
-                let kind = self.kind();
-
-                async move { Err(FailpointHit(kind)) }.boxed()
-            }
-            Failpoint::WaitBeforeStartingEvicting(arrival, b)
-            | Failpoint::WaitBeforeDownloading(arrival, b) => {
-                // first one signals arrival
-                drop(arrival.take());
-
-                let b = b.clone();
-
-                async move {
-                    tracing::trace!("waiting on a failpoint barrier");
-                    b.wait().await;
-                    tracing::trace!("done waiting on a failpoint barrier");
-                    Ok(())
-                }
-                .boxed()
-            }
-        }
-    }
-}
-
-impl std::fmt::Display for FailpointKind {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        std::fmt::Debug::fmt(self, f)
-    }
-}
-
-#[derive(Debug)]
-pub(crate) struct FailpointHit(FailpointKind);
-
-impl std::fmt::Display for FailpointHit {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        std::fmt::Debug::fmt(self, f)
-    }
-}
-
-impl std::error::Error for FailpointHit {}
-
-impl From<FailpointHit> for DownloadError {
-    fn from(value: FailpointHit) -> Self {
-        DownloadError::Failpoint(value.0)
-    }
-}
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,13 +1,14 @@
+use futures::StreamExt;
 use pageserver_api::key::CONTROLFILE_KEY;
 use tokio::task::JoinSet;
+use tracing::Instrument;
 use utils::{
    completion::{self, Completion},
    id::TimelineId,
 };

-use super::failpoints::{Failpoint, FailpointKind};
 use super::*;
-use crate::context::DownloadBehavior;
+use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
 use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

 /// Used in tests to advance a future to wanted await point, and not futher.
@@ -20,7 +21,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
 /// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
 #[tokio::test]
 async fn smoke_test() {
-    let handle = tokio::runtime::Handle::current();
+    let handle = BACKGROUND_RUNTIME.handle();

    let h = TenantHarness::create("smoke_test").unwrap();
    let span = h.span();
@@ -37,7 +38,7 @@ async fn smoke_test() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.resident_layers().collect::<Vec<_>>().await
        };

        assert_eq!(layers.len(), 1);
@@ -87,7 +88,7 @@ async fn smoke_test() {
    //
    // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
    // artificially slow it down.
-    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;

    match layer
        .evict_and_wait(std::time::Duration::ZERO)
@@ -98,7 +99,7 @@ async fn smoke_test() {
            // expected, but note that the eviction is "still ongoing"
            helper.release().await;
            // exhaust spawn_blocking pool to ensure it is now complete
-            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
+            SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
                .await;
        }
        other => unreachable!("{other:?}"),
@@ -107,7 +108,7 @@ async fn smoke_test() {
    // only way to query if a layer is resident is to acquire a ResidentLayer instance.
    // Layer::keep_resident never downloads, but it might initialize if the layer file is found
    // downloaded locally.
-    let none = layer.keep_resident().await;
+    let none = layer.keep_resident().await.unwrap();
    assert!(
        none.is_none(),
        "Expected none, because eviction removed the local file, found: {none:?}"
@@ -166,7 +167,6 @@ async fn smoke_test() {
    rtc.wait_completion().await.unwrap();

    assert_eq!(rtc.get_remote_physical_size(), 0);
-    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -174,7 +174,7 @@ async fn smoke_test() {
 #[tokio::test(start_paused = true)]
 async fn evict_and_wait_on_wanted_deleted() {
    // this is the runtime on which Layer spawns the blocking tasks on
-    let handle = tokio::runtime::Handle::current();
+    let handle = BACKGROUND_RUNTIME.handle();

    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.resident_layers().collect::<Vec<_>>().await
        };

        assert_eq!(layers.len(), 1);
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
        drop(resident);

        // make sure the eviction task gets to run
-        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;

        let resident = layer.keep_resident().await;
        assert!(
-            resident.is_none(),
+            matches!(resident, Ok(None)),
            "keep_resident should not have re-initialized: {resident:?}"
        );

@@ -235,332 +235,24 @@ async fn evict_and_wait_on_wanted_deleted() {
        layers.finish_gc_timeline(&[layer]);
    }

-    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;

    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
-    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

-/// This test ensures we are able to read the layer while the layer eviction has been
-/// started but not completed.
-#[test]
-fn read_wins_pending_eviction() {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .max_blocking_threads(1)
-        .enable_all()
-        .start_paused(true)
-        .build()
-        .unwrap();
-
-    rt.block_on(async move {
-        // this is the runtime on which Layer spawns the blocking tasks on
-        let handle = tokio::runtime::Handle::current();
-        let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
-        let (tenant, ctx) = h.load().await;
-        let span = h.span();
-        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
-
-        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
-            .await
-            .unwrap();
-
-        let layer = {
-            let mut layers = {
-                let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().collect::<Vec<_>>()
-            };
-
-            assert_eq!(layers.len(), 1);
-
-            layers.swap_remove(0)
-        };
-
-        // setup done
-
-        let resident = layer.keep_resident().await.unwrap();
-
-        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
-
-        // drive the future to await on the status channel
-        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
-            .await
-            .expect_err("should had been a timeout since we are holding the layer resident");
-        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
-
-        let (completion, barrier) = utils::completion::channel();
-        let (arrival, arrived_at_barrier) = utils::completion::channel();
-        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
-            Some(arrival),
-            barrier,
-        ));
-
-        // now the eviction cannot proceed because the threads are consumed while completion exists
-        drop(resident);
-        arrived_at_barrier.wait().await;
-        assert!(!layer.is_likely_resident());
-
-        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
-        layer
-            .0
-            .get_or_maybe_download(false, None)
-            .instrument(download_span)
-            .await
-            .expect("should had reinitialized without downloading");
-
-        assert!(layer.is_likely_resident());
-
-        // reinitialization notifies of new resident status, which should error out all evict_and_wait
-        let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
-            .await
-            .expect("no timeout, because get_or_maybe_download re-initialized")
-            .expect_err("eviction should not have succeeded because re-initialized");
-
-        // works as intended: evictions lose to "downloads"
-        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // this is not wrong: the eviction is technically still "on the way" as it's still queued
-        // because of a failpoint
-        assert_eq!(
-            0,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        drop(completion);
-
-        tokio::time::sleep(ADVANCE).await;
-        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
-            .await;
-
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // now we finally can observe the original eviction failing
-        // it would had been possible to observe it earlier, but here it is guaranteed to have
-        // happened.
-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
-        );
-
-        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
-    });
-}
-
-/// Use failpoint to delay an eviction starting to get a VersionCheckFailed.
-#[test]
-fn multiple_pending_evictions_in_order() {
-    let name = "multiple_pending_evictions_in_order";
-    let in_order = true;
-    multiple_pending_evictions_scenario(name, in_order);
-}
-
-/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState.
-#[test]
-fn multiple_pending_evictions_out_of_order() {
-    let name = "multiple_pending_evictions_out_of_order";
-    let in_order = false;
-    multiple_pending_evictions_scenario(name, in_order);
-}
-
-fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .max_blocking_threads(1)
-        .enable_all()
-        .start_paused(true)
-        .build()
-        .unwrap();
-
-    rt.block_on(async move {
-        // this is the runtime on which Layer spawns the blocking tasks on
-        let handle = tokio::runtime::Handle::current();
-        let h = TenantHarness::create(name).unwrap();
-        let (tenant, ctx) = h.load().await;
-        let span = h.span();
-        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
-
-        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
-            .await
-            .unwrap();
-
-        let layer = {
-            let mut layers = {
-                let layers = timeline.layers.read().await;
-                layers.likely_resident_layers().collect::<Vec<_>>()
-            };
-
-            assert_eq!(layers.len(), 1);
-
-            layers.swap_remove(0)
-        };
-
-        // setup done
-
-        let resident = layer.keep_resident().await.unwrap();
-
-        let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
-
-        // drive the future to await on the status channel
-        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
-            .await
-            .expect_err("should had been a timeout since we are holding the layer resident");
-        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
-
-        let (completion1, barrier) = utils::completion::channel();
-        let mut completion1 = Some(completion1);
-        let (arrival, arrived_at_barrier) = utils::completion::channel();
-        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
-            Some(arrival),
-            barrier,
-        ));
-
-        // now the eviction cannot proceed because we are simulating arbitrary long delay for the
-        // eviction task start.
-        drop(resident);
-        assert!(!layer.is_likely_resident());
-
-        arrived_at_barrier.wait().await;
-
-        // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
-        layer
-            .0
-            .get_or_maybe_download(false, None)
-            .instrument(download_span)
-            .await
-            .expect("should had reinitialized without downloading");
-
-        assert!(layer.is_likely_resident());
-
-        // reinitialization notifies of new resident status, which should error out all evict_and_wait
-        let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
-            .await
-            .expect("no timeout, because get_or_maybe_download re-initialized")
-            .expect_err("eviction should not have succeeded because re-initialized");
-
-        // works as intended: evictions lose to "downloads"
-        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // this is not wrong: the eviction is technically still "on the way" as it's still queued
-        // because of a failpoint
-        assert_eq!(
-            0,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // configure another failpoint for the second eviction -- evictions are per initialization,
-        // so now that we've reinitialized the inner, we get to run two of them at the same time.
-        let (completion2, barrier) = utils::completion::channel();
-        let (arrival, arrived_at_barrier) = utils::completion::channel();
-        layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
-            Some(arrival),
-            barrier,
-        ));
-
-        let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
-
-        // advance to the wait on the queue
-        tokio::time::timeout(ADVANCE, &mut second_eviction)
-            .await
-            .expect_err("timeout because failpoint is blocking");
-
-        arrived_at_barrier.wait().await;
-
-        assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
-
-        let mut release_earlier_eviction = |expected_reason| {
-            assert_eq!(
-                0,
-                LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
-            );
-
-            drop(completion1.take().unwrap());
-
-            let handle = &handle;
-
-            async move {
-                tokio::time::sleep(ADVANCE).await;
-                SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
-                    handle, 1,
-                )
-                .await;
-
-                assert_eq!(
-                    1,
-                    LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
-                );
-            }
-        };
-
-        if in_order {
-            release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
-        }
-
-        // release the later eviction which is for the current version
-        drop(completion2);
-        tokio::time::sleep(ADVANCE).await;
-        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
-            .await;
-
-        if !in_order {
-            release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
-        }
-
-        tokio::time::timeout(ADVANCE, &mut second_eviction)
-            .await
-            .expect("eviction goes through now that spawn_blocking is unclogged")
-            .expect("eviction should succeed, because version matches");
-
-        assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // ensure the cancelled are unchanged
-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
-    });
-}
-
-/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
-/// a `Layer::keep_resident` call.
+/// This test ensures we are able to read the layer while the layer eviction has been
+/// started but not completed due to spawn_blocking pool being blocked.
 ///
-/// This matters because cancelling the eviction would leave us in a state where the file is on
-/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to
-/// have non-repairing `Layer::is_likely_resident`.
+/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
 #[tokio::test(start_paused = true)]
-async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
-    let handle = tokio::runtime::Handle::current();
-    let h =
-        TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
+async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
+    // this is the runtime on which Layer spawns the blocking tasks on
+    let handle = BACKGROUND_RUNTIME.handle();
+    let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
+        .unwrap();
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
@@ -571,7 +263,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
+            layers.resident_layers().collect::<Vec<_>>().await
        };

        assert_eq!(layers.len(), 1);
@@ -579,154 +271,90 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
        layers.swap_remove(0)
    };

-    // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
-    // Err) at the right time as in "during" the `LayerInner::needs_download`.
-    layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
+    // setup done

-    let (completion, barrier) = utils::completion::channel();
-    let (arrival, arrived_at_barrier) = utils::completion::channel();
-
-    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
-        Some(arrival),
-        barrier,
-    ));
-
-    tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
-        .await
-        .expect_err("should had advanced to waiting on channel");
-
-    arrived_at_barrier.wait().await;
-
-    // simulate a cancelled read which is cancelled before it gets to re-initialize
-    let e = layer
-        .0
-        .get_or_maybe_download(false, None)
-        .await
-        .unwrap_err();
-    assert!(
-        matches!(
-            e,
-            DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
-        ),
-        "{e:?}"
-    );
-
-    assert!(
-        layer.0.needs_download().await.unwrap().is_none(),
-        "file is still on disk"
-    );
-
-    // release the eviction task
-    drop(completion);
-    tokio::time::sleep(ADVANCE).await;
-    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
-
-    // failpoint is still enabled, but it is not hit
-    let e = layer
-        .0
-        .get_or_maybe_download(false, None)
-        .await
-        .unwrap_err();
-    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
-
-    // failpoint is not counted as cancellation either
-    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
-}
-
-#[tokio::test(start_paused = true)]
-async fn evict_and_wait_does_not_wait_for_download() {
-    // let handle = tokio::runtime::Handle::current();
-    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
-    let (tenant, ctx) = h.load().await;
-    let span = h.span();
-    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
-
-    let timeline = tenant
-        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
-        .await
-        .unwrap();
-
-    let layer = {
-        let mut layers = {
-            let layers = timeline.layers.read().await;
-            layers.likely_resident_layers().collect::<Vec<_>>()
-        };
-
-        assert_eq!(layers.len(), 1);
-
-        layers.swap_remove(0)
-    };
-
-    // kind of forced setup: start an eviction but do not allow it progress until we are
-    // downloading
-    let (eviction_can_continue, barrier) = utils::completion::channel();
-    let (arrival, eviction_arrived) = utils::completion::channel();
-    layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
-        Some(arrival),
-        barrier,
-    ));
+    let resident = layer.keep_resident().await.unwrap();

    let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));

-    // use this once-awaited other_evict to synchronize with the eviction
-    let other_evict = layer.evict_and_wait(FOREVER);
-
+    // drive the future to await on the status channel
    tokio::time::timeout(ADVANCE, &mut evict_and_wait)
        .await
-        .expect_err("should had advanced");
-    eviction_arrived.wait().await;
-    drop(eviction_can_continue);
-    other_evict.await.unwrap();
+        .expect_err("should had been a timeout since we are holding the layer resident");
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

-    // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
-    assert!(!layer.is_likely_resident());
+    // clog up BACKGROUND_RUNTIME spawn_blocking
+    let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;

-    // following new evict_and_wait will fail until we've completed the download
-    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
-    assert!(matches!(e, EvictionError::NotFound), "{e:?}");
+    // now the eviction cannot proceed because the threads are consumed while completion exists
+    drop(resident);

-    let (download_can_continue, barrier) = utils::completion::channel();
-    let (arrival, _download_arrived) = utils::completion::channel();
-    layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));
+    // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
+    layer
+        .keep_resident()
+        .await
+        .expect("keep_resident should had reinitialized without downloading")
+        .expect("ResidentLayer");

-    let mut download = std::pin::pin!(layer
-        .0
-        .get_or_maybe_download(true, None)
-        .instrument(download_span));
+    // because the keep_resident check alters wanted evicted without sending a message, we will
+    // never get completed
+    let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
+        .await
+        .expect("no timeout, because keep_resident re-initialized")
+        .expect_err("eviction should not have succeeded because re-initialized");

-    assert!(
-        !layer.is_likely_resident(),
-        "during download layer is evicted"
+    // works as intended: evictions lose to "downloads"
+    assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
+    assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+    // this is not wrong: the eviction is technically still "on the way" as it's still queued
+    // because spawn_blocking is clogged up
+    assert_eq!(
+        0,
+        LAYER_IMPL_METRICS
+            .cancelled_evictions
+            .values()
+            .map(|ctr| ctr.get())
+            .sum::<u64>()
    );

-    tokio::time::timeout(ADVANCE, &mut download)
+    let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
+
+    // advance to the wait on the queue
+    tokio::time::timeout(ADVANCE, &mut second_eviction)
        .await
-        .expect_err("should had timed out because of failpoint");
+        .expect_err("timeout because spawn_blocking is clogged");

-    // now we finally get to continue, and because the latest state is downloading, we deduce that
-    // original eviction succeeded
-    evict_and_wait.await.unwrap();
+    // in this case we don't leak started evictions, but I think there is still a chance of that
+    // happening, because we could have upgrades race multiple evictions while only one of them
+    // happens?
+    assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());

-    // however a new evict_and_wait will fail
-    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
-    assert!(matches!(e, EvictionError::NotFound), "{e:?}");
+    helper.release().await;

-    assert!(!layer.is_likely_resident());
+    // the second_eviction gets to run here
+    //
+    // synchronize to be *strictly* after the second_eviction spawn_blocking run
+    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;

-    drop(download_can_continue);
-    download.await.expect("download should had succeeded");
-    assert!(layer.is_likely_resident());
+    tokio::time::timeout(ADVANCE, &mut second_eviction)
+        .await
+        .expect("eviction goes through now that spawn_blocking is unclogged")
+        .expect("eviction should succeed, because version matches");

-    // only now can we evict
-    layer.evict_and_wait(FOREVER).await.unwrap();
-}
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());

-#[test]
-fn layer_size() {
-    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
-    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
-    // it also has the utf8 path
+    // now we finally can observe the original spawn_blocking failing
+    // it would had been possible to observe it earlier, but here it is guaranteed to have
+    // happened.
+    assert_eq!(
+        1,
+        LAYER_IMPL_METRICS
+            .cancelled_evictions
+            .values()
+            .map(|ctr| ctr.get())
+            .sum::<u64>()
+    );
 }

 struct SpawnBlockingPoolHelper {
@@ -743,41 +371,31 @@ impl SpawnBlockingPoolHelper {
    ///
    /// This should be no issue nowdays, because nextest runs each test in it's own process.
    async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
-        let default_max_blocking_threads = 512;
-
-        Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
-    }
-
-    async fn consume_all_spawn_blocking_threads0(
-        handle: &tokio::runtime::Handle,
-        threads: usize,
-    ) -> Self {
-        assert_ne!(threads, 0);
-
        let (completion, barrier) = completion::channel();
-        let (started, starts_completed) = completion::channel();
+        let (tx, mut rx) = tokio::sync::mpsc::channel(8);
+
+        let assumed_max_blocking_threads = 512;

        let mut blocking_tasks = JoinSet::new();

-        for _ in 0..threads {
+        for _ in 0..assumed_max_blocking_threads {
            let barrier = barrier.clone();
-            let started = started.clone();
+            let tx = tx.clone();
            blocking_tasks.spawn_blocking_on(
                move || {
-                    drop(started);
+                    tx.blocking_send(()).unwrap();
+                    drop(tx);
                    tokio::runtime::Handle::current().block_on(barrier.wait());
                },
                handle,
            );
        }

-        drop(started);
-
-        starts_completed.wait().await;
-
        drop(barrier);

-        tracing::trace!("consumed all threads");
+        for _ in 0..assumed_max_blocking_threads {
+            rx.recv().await.unwrap();
+        }

        SpawnBlockingPoolHelper {
            awaited_by_spawn_blocking_tasks: completion,
@@ -797,22 +415,13 @@ impl SpawnBlockingPoolHelper {
        while let Some(res) = blocking_tasks.join_next().await {
            res.expect("none of the tasks should had panicked");
        }
-
-        tracing::trace!("released all threads");
    }

    /// In the tests it is used as an easy way of making sure something scheduled on the target
    /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed
    /// before our tasks have a chance to schedule and complete.
    async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
-        Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
-    }
-
-    async fn consume_and_release_all_of_spawn_blocking_threads0(
-        handle: &tokio::runtime::Handle,
-        threads: usize,
-    ) {
-        Self::consume_all_spawn_blocking_threads0(handle, threads)
+        Self::consume_all_spawn_blocking_threads(handle)
            .await
            .release()
            .await
@@ -826,7 +435,7 @@ fn spawn_blocking_pool_helper_actually_works() {
    // because the amount is not configurable for our helper, expect the same amount as
    // BACKGROUND_RUNTIME using the tokio defaults would have.
    let rt = tokio::runtime::Builder::new_current_thread()
-        .max_blocking_threads(1)
+        .max_blocking_threads(512)
        .enable_all()
        .build()
        .unwrap();
@@ -836,8 +445,7 @@ fn spawn_blocking_pool_helper_actually_works() {
    rt.block_on(async move {
        // this will not return until all threads are spun up and actually executing the code
        // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
-        let consumed =
-            SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;
+        let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;

        println!("consumed");

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -13,6 +13,7 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::stream::StreamExt;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::AUX_FILES_KEY,
@@ -36,7 +37,6 @@ use tracing::*;
 use utils::{
    bin_ser::BeSer,
    sync::gate::{Gate, GateGuard},
-    vec_map::VecMap,
 };

 use std::ops::{Deref, Range};
@@ -2442,7 +2442,7 @@ impl Timeline {

        let guard = self.layers.read().await;

-        let resident = guard.likely_resident_layers().map(|layer| {
+        let resident = guard.resident_layers().map(|layer| {
            let last_activity_ts = layer.access_stats().latest_activity_or_now();

            HeatMapLayer::new(
@@ -2452,7 +2452,7 @@ impl Timeline {
            )
        });

-        let layers = resident.collect();
+        let layers = resident.collect().await;

        Some(HeatMapTimeline::new(self.timeline_id, layers))
    }
@@ -4302,7 +4302,7 @@ impl Timeline {
        let mut max_layer_size: Option<u64> = None;

        let resident_layers = guard
-            .likely_resident_layers()
+            .resident_layers()
            .map(|layer| {
                let file_size = layer.layer_desc().file_size;
                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
@@ -4315,7 +4315,8 @@ impl Timeline {
                    relative_last_activity: finite_f32::FiniteF32::ZERO,
                }
            })
-            .collect();
+            .collect()
+            .await;

        DiskUsageEvictionInfo {
            max_layer_size,
@@ -4617,15 +4618,16 @@ impl<'a> TimelineWriter<'a> {
        }
    }

-    /// Put a batch of keys at the specified Lsns.
+    /// Put a batch of keys at the specified Lsns.
    ///
-    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
+    /// The batch should be sorted by Lsn such that it's safe
+    /// to roll the open layer mid batch.
    pub(crate) async fn put_batch(
        &mut self,
-        batch: VecMap<Lsn, (Key, Value)>,
+        batch: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        for (lsn, (key, val)) in batch {
+        for (key, lsn, val) in batch {
            self.put(key, lsn, &val, ctx).await?
        }

@@ -4711,6 +4713,7 @@ mod tests {
            .keep_resident()
            .await
            .expect("no download => no downloading errors")
+            .expect("should had been resident")
            .drop_eviction_guard();

        let forever = std::time::Duration::from_secs(120);
@@ -4721,7 +4724,7 @@ mod tests {
        let (first, second) = tokio::join!(first, second);

        let res = layer.keep_resident().await;
-        assert!(res.is_none(), "{res:?}");
+        assert!(matches!(res, Ok(None)), "{res:?}");

        match (first, second) {
            (Ok(()), Ok(())) => {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -225,18 +225,24 @@ impl Timeline {
        {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
-            for layer in layers.iter_historic_layers() {
-                let layer = guard.get_from_desc(&layer);
+            for hist_layer in layers.iter_historic_layers() {
+                let hist_layer = guard.get_from_desc(&hist_layer);

                // guard against eviction while we inspect it; it might be that eviction_task and
                // disk_usage_eviction_task both select the same layers to be evicted, and
                // seemingly free up double the space. both succeeding is of no consequence.
+                let guard = match hist_layer.keep_resident().await {
+                    Ok(Some(l)) => l,
+                    Ok(None) => continue,
+                    Err(e) => {
+                        // these should not happen, but we cannot make them statically impossible right
+                        // now.
+                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
+                        continue;
+                    }
+                };

-                if !layer.is_likely_resident() {
-                    continue;
-                }
-
-                let last_activity_ts = layer.access_stats().latest_activity_or_now();
+                let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();

                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
@@ -259,8 +265,9 @@ impl Timeline {
                        continue;
                    }
                };
-
+                let layer = guard.drop_eviction_guard();
                if no_activity_for > p.threshold {
+                    // this could cause a lot of allocations in some cases
                    js.spawn(async move {
                        layer
                            .evict_and_wait(std::time::Duration::from_secs(5))
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure, Context, Result};
+use futures::StreamExt;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -240,16 +241,29 @@ impl LayerManager {
        layer.delete_on_drop();
    }

-    pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
+    pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
        // for small layer maps, we most likely have all resident, but for larger more are likely
        // to be evicted assuming lots of layers correlated with longer lifespan.

-        self.layer_map().iter_historic_layers().filter_map(|desc| {
-            self.layer_fmgr
-                .0
-                .get(&desc.key())
-                .filter(|l| l.is_likely_resident())
-                .cloned()
+        let layers = self
+            .layer_map()
+            .iter_historic_layers()
+            .map(|desc| self.get_from_desc(&desc));
+
+        let layers = futures::stream::iter(layers);
+
+        layers.filter_map(|layer| async move {
+            // TODO(#6028): this query does not really need to see the ResidentLayer
+            match layer.keep_resident().await {
+                Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
+                Ok(None) => None,
+                Err(e) => {
+                    // these should not happen, but we cannot make them statically impossible right
+                    // now.
+                    tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
+                    None
+                }
+            }
        })
    }

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -549,10 +549,9 @@ walprop_pg_init_standalone_sync_safekeepers(void)
 static void
 walprop_sigusr2(SIGNAL_ARGS)
 {
-	int			save_errno = errno;
 	got_SIGUSR2 = true;
+
 	SetLatch(MyLatch);
-	errno = save_errno;
 }

 static void
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -59,11 +59,10 @@ rustls.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-sha2 = { workspace = true, features = ["asm"] }
+sha2.workspace = true
 smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
-subtle.workspace = true
 sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
@@ -92,7 +91,6 @@ workspace_hack.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
-fallible-iterator.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -408,228 +408,3 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
        }
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use bytes::BytesMut;
-    use fallible_iterator::FallibleIterator;
-    use postgres_protocol::{
-        authentication::sasl::{ChannelBinding, ScramSha256},
-        message::{backend::Message as PgMessage, frontend},
-    };
-    use provider::AuthSecret;
-    use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
-
-    use crate::{
-        auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
-        config::AuthenticationConfig,
-        console::{
-            self,
-            provider::{self, CachedAllowedIps, CachedRoleSecret},
-            CachedNodeInfo,
-        },
-        context::RequestMonitoring,
-        proxy::NeonOptions,
-        scram::ServerSecret,
-        stream::{PqStream, Stream},
-    };
-
-    use super::auth_quirks;
-
-    struct Auth {
-        ips: Vec<IpPattern>,
-        secret: AuthSecret,
-    }
-
-    impl console::Api for Auth {
-        async fn get_role_secret(
-            &self,
-            _ctx: &mut RequestMonitoring,
-            _user_info: &super::ComputeUserInfo,
-        ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
-            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
-        }
-
-        async fn get_allowed_ips_and_secret(
-            &self,
-            _ctx: &mut RequestMonitoring,
-            _user_info: &super::ComputeUserInfo,
-        ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
-        {
-            Ok((
-                CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
-                Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
-            ))
-        }
-
-        async fn wake_compute(
-            &self,
-            _ctx: &mut RequestMonitoring,
-            _user_info: &super::ComputeUserInfo,
-        ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-            unimplemented!()
-        }
-    }
-
-    static CONFIG: &AuthenticationConfig = &AuthenticationConfig {
-        scram_protocol_timeout: std::time::Duration::from_secs(5),
-    };
-
-    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
-        loop {
-            r.read_buf(&mut *b).await.unwrap();
-            if let Some(m) = PgMessage::parse(&mut *b).unwrap() {
-                break m;
-            }
-        }
-    }
-
-    #[tokio::test]
-    async fn auth_quirks_scram() {
-        let (mut client, server) = tokio::io::duplex(1024);
-        let mut stream = PqStream::new(Stream::from_raw(server));
-
-        let mut ctx = RequestMonitoring::test();
-        let api = Auth {
-            ips: vec![],
-            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
-        };
-
-        let user_info = ComputeUserInfoMaybeEndpoint {
-            user: "conrad".into(),
-            endpoint_id: Some("endpoint".into()),
-            options: NeonOptions::default(),
-        };
-
-        let handle = tokio::spawn(async move {
-            let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported());
-
-            let mut read = BytesMut::new();
-
-            // server should offer scram
-            match read_message(&mut client, &mut read).await {
-                PgMessage::AuthenticationSasl(a) => {
-                    let options: Vec<&str> = a.mechanisms().collect().unwrap();
-                    assert_eq!(options, ["SCRAM-SHA-256"]);
-                }
-                _ => panic!("wrong message"),
-            }
-
-            // client sends client-first-message
-            let mut write = BytesMut::new();
-            frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap();
-            client.write_all(&write).await.unwrap();
-
-            // server response with server-first-message
-            match read_message(&mut client, &mut read).await {
-                PgMessage::AuthenticationSaslContinue(a) => {
-                    scram.update(a.data()).await.unwrap();
-                }
-                _ => panic!("wrong message"),
-            }
-
-            // client response with client-final-message
-            write.clear();
-            frontend::sasl_response(scram.message(), &mut write).unwrap();
-            client.write_all(&write).await.unwrap();
-
-            // server response with server-final-message
-            match read_message(&mut client, &mut read).await {
-                PgMessage::AuthenticationSaslFinal(a) => {
-                    scram.finish(a.data()).unwrap();
-                }
-                _ => panic!("wrong message"),
-            }
-        });
-
-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG)
-            .await
-            .unwrap();
-
-        handle.await.unwrap();
-    }
-
-    #[tokio::test]
-    async fn auth_quirks_cleartext() {
-        let (mut client, server) = tokio::io::duplex(1024);
-        let mut stream = PqStream::new(Stream::from_raw(server));
-
-        let mut ctx = RequestMonitoring::test();
-        let api = Auth {
-            ips: vec![],
-            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
-        };
-
-        let user_info = ComputeUserInfoMaybeEndpoint {
-            user: "conrad".into(),
-            endpoint_id: Some("endpoint".into()),
-            options: NeonOptions::default(),
-        };
-
-        let handle = tokio::spawn(async move {
-            let mut read = BytesMut::new();
-            let mut write = BytesMut::new();
-
-            // server should offer cleartext
-            match read_message(&mut client, &mut read).await {
-                PgMessage::AuthenticationCleartextPassword => {}
-                _ => panic!("wrong message"),
-            }
-
-            // client responds with password
-            write.clear();
-            frontend::password_message(b"my-secret-password", &mut write).unwrap();
-            client.write_all(&write).await.unwrap();
-        });
-
-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
-            .await
-            .unwrap();
-
-        handle.await.unwrap();
-    }
-
-    #[tokio::test]
-    async fn auth_quirks_password_hack() {
-        let (mut client, server) = tokio::io::duplex(1024);
-        let mut stream = PqStream::new(Stream::from_raw(server));
-
-        let mut ctx = RequestMonitoring::test();
-        let api = Auth {
-            ips: vec![],
-            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
-        };
-
-        let user_info = ComputeUserInfoMaybeEndpoint {
-            user: "conrad".into(),
-            endpoint_id: None,
-            options: NeonOptions::default(),
-        };
-
-        let handle = tokio::spawn(async move {
-            let mut read = BytesMut::new();
-
-            // server should offer cleartext
-            match read_message(&mut client, &mut read).await {
-                PgMessage::AuthenticationCleartextPassword => {}
-                _ => panic!("wrong message"),
-            }
-
-            // client responds with password
-            let mut write = BytesMut::new();
-            frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write)
-                .unwrap();
-            client.write_all(&write).await.unwrap();
-        });
-
-        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
-            .await
-            .unwrap();
-
-        assert_eq!(creds.info.endpoint, "my-endpoint");
-
-        handle.await.unwrap();
-    }
-}
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -194,7 +194,14 @@ pub(crate) async fn validate_password_and_exchange(
        }
        // perform scram authentication as both client and server to validate the keys
        AuthSecret::Scram(scram_secret) => {
-            let outcome = crate::scram::exchange(&scram_secret, password).await?;
+            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
+            let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
+            let outcome = crate::scram::exchange(
+                &scram_secret,
+                sasl_client,
+                crate::config::TlsServerEndPoint::Undefined,
+            )
+            .await?;

            let client_key = match outcome {
                sasl::Outcome::Success(client_key) => client_key,
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -82,13 +82,14 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 /// A config for establishing a connection to compute node.
 /// Eventually, `tokio_postgres` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
-#[derive(Clone, Default)]
+#[derive(Clone)]
+#[repr(transparent)]
 pub struct ConnCfg(Box<tokio_postgres::Config>);

 /// Creation and initialization routines.
 impl ConnCfg {
    pub fn new() -> Self {
-        Self::default()
+        Self(Default::default())
    }

    /// Reuse password or auth keys from the other config.
@@ -164,6 +165,12 @@ impl std::ops::DerefMut for ConnCfg {
    }
 }

+impl Default for ConnCfg {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ConnCfg {
    /// Establish a raw TCP connection to the compute node.
    async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -6,7 +6,7 @@ pub mod messages;

 /// Wrappers for console APIs and their mocks.
 pub mod provider;
-pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
+pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};

 /// Various cache-related types.
 pub mod caches {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -14,6 +14,7 @@ use crate::{
    context::RequestMonitoring,
    scram, EndpointCacheKey, ProjectId,
 };
+use async_trait::async_trait;
 use dashmap::DashMap;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
@@ -325,7 +326,8 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPatt

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
-pub(crate) trait Api {
+#[async_trait]
+pub trait Api {
    /// Get the client's auth secret for authentication.
    /// Returns option because user not found situation is special.
    /// We still have to mock the scram to avoid leaking information that user doesn't exist.
@@ -361,6 +363,7 @@ pub enum ConsoleBackend {
    Test(Box<dyn crate::auth::backend::TestBackend>),
 }

+#[async_trait]
 impl Api for ConsoleBackend {
    async fn get_role_secret(
        &self,
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -8,6 +8,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
+use async_trait::async_trait;
 use futures::TryFutureExt;
 use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
@@ -143,6 +144,7 @@ async fn get_execute_postgres_query(
    Ok(Some(entry))
 }

+#[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_role_secret(
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,6 +14,7 @@ use crate::{
    context::RequestMonitoring,
    metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
 };
+use async_trait::async_trait;
 use futures::TryFutureExt;
 use std::sync::Arc;
 use tokio::time::Instant;
@@ -167,6 +168,7 @@ impl Api {
    }
 }

+#[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_role_secret(
--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -2,21 +2,14 @@ use anyhow::{anyhow, bail};
 use hyper::{Body, Request, Response, StatusCode};
 use std::{convert::Infallible, net::TcpListener};
 use tracing::info;
-use utils::http::{
-    endpoint::{self, prometheus_metrics_handler, request_span},
-    error::ApiError,
-    json::json_response,
-    RouterBuilder, RouterService,
-};
+use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};

 async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, "")
 }

 fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
-    endpoint::make_router()
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
-        .get("/v1/status", status_handler)
+    endpoint::make_router().get("/v1/status", status_handler)
 }

 pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -135,10 +135,9 @@ impl TestAuth for NoAuth {}
 struct Scram(scram::ServerSecret);

 impl Scram {
-    async fn new(password: &str) -> anyhow::Result<Self> {
-        let secret = scram::ServerSecret::build(password)
-            .await
-            .context("failed to generate scram secret")?;
+    fn new(password: &str) -> anyhow::Result<Self> {
+        let secret =
+            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
        Ok(Scram(secret))
    }

@@ -285,7 +284,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new(password).await?,
+        Scram::new(password)?,
    ));

    let (_client, _conn) = tokio_postgres::Config::new()
@@ -309,7 +308,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new("password").await?,
+        Scram::new("password")?,
    ));

    let (_client, _conn) = tokio_postgres::Config::new()
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new("password").await?,
+        Scram::new("password")?,
    ));

    let _client_err = tokio_postgres::Config::new()
@@ -231,7 +231,7 @@ async fn connect_failure(
    let proxy = tokio::spawn(dummy_proxy(
        client,
        Some(server_config),
-        Scram::new("password").await?,
+        Scram::new("password")?,
    ));

    let _client_err = tokio_postgres::Config::new()
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -33,9 +33,6 @@ pub enum Error {
    #[error("Internal error: missing digest")]
    MissingBinding,

-    #[error("could not decode salt: {0}")]
-    Base64(#[from] base64::DecodeError),
-
    #[error(transparent)]
    Io(#[from] io::Error),
 }
@@ -58,7 +55,6 @@ impl ReportableError for Error {
            Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
            Error::BadClientMessage(_) => crate::error::ErrorKind::User,
            Error::MissingBinding => crate::error::ErrorKind::Service,
-            Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
            Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
        }
    }
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -56,6 +56,8 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

 #[cfg(test)]
 mod tests {
+    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
+
    use crate::sasl::{Mechanism, Step};

    use super::{Exchange, ServerSecret};
@@ -112,10 +114,17 @@ mod tests {
    }

    async fn run_round_trip_test(server_password: &str, client_password: &str) {
-        let scram_secret = ServerSecret::build(server_password).await.unwrap();
-        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
-            .await
-            .unwrap();
+        let scram_secret = ServerSecret::build(server_password).unwrap();
+        let sasl_client =
+            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
+
+        let outcome = super::exchange(
+            &scram_secret,
+            sasl_client,
+            crate::config::TlsServerEndPoint::Undefined,
+        )
+        .await
+        .unwrap();

        match outcome {
            crate::sasl::Outcome::Success(_) => {}
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -2,16 +2,13 @@

 use std::convert::Infallible;

-use hmac::{Hmac, Mac};
-use sha2::Sha256;
-use tokio::task::yield_now;
+use postgres_protocol::authentication::sasl::ScramSha256;

 use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
-use super::ScramKey;
 use crate::config;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};

@@ -74,62 +71,47 @@ impl<'a> Exchange<'a> {
    }
 }

-// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
-async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
-    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-    let mut prev = hmac
-        .clone()
-        .chain_update(salt)
-        .chain_update(1u32.to_be_bytes())
-        .finalize()
-        .into_bytes();
-
-    let mut hi = prev;
-
-    for i in 1..iterations {
-        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
-
-        for (hi, prev) in hi.iter_mut().zip(prev) {
-            *hi ^= prev;
-        }
-        // yield every ~250us
-        // hopefully reduces tail latencies
-        if i % 1024 == 0 {
-            yield_now().await
-        }
-    }
-
-    hi.into()
-}
-
-// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
-async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
-    let salted_password = pbkdf2(password, salt, iterations).await;
-
-    let make_key = |name| {
-        let key = Hmac::<Sha256>::new_from_slice(&salted_password)
-            .expect("HMAC is able to accept all key sizes")
-            .chain_update(name)
-            .finalize();
-
-        <[u8; 32]>::from(key.into_bytes())
-    };
-
-    make_key(b"Client Key").into()
-}
-
 pub async fn exchange(
    secret: &ServerSecret,
-    password: &[u8],
+    mut client: ScramSha256,
+    tls_server_end_point: config::TlsServerEndPoint,
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
-    let salt = base64::decode(&secret.salt_base64)?;
-    let client_key = derive_client_key(password, &salt, secret.iterations).await;
+    use sasl::Step::*;

-    if secret.is_password_invalid(&client_key).into() {
-        Ok(sasl::Outcome::Failure("password doesn't match"))
-    } else {
-        Ok(sasl::Outcome::Success(client_key))
-    }
+    let init = SaslInitial {
+        nonce: rand::random,
+    };
+
+    let client_first = std::str::from_utf8(client.message())
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
+        Continue(sent, server_first) => {
+            // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread.
+            // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl
+            client = tokio::task::spawn_blocking(move || {
+                client.update(server_first.as_bytes())?;
+                Ok::<ScramSha256, std::io::Error>(client)
+            })
+            .await
+            .expect("should not panic while performing password hash")?;
+            sent
+        }
+        Success(x, _) => match x {},
+        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
+    };
+
+    let client_final = std::str::from_utf8(client.message())
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
+        Success(keys, server_final) => {
+            client.finish(server_final.as_bytes())?;
+            keys
+        }
+        Continue(x, _) => match x {},
+        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
+    };
+
+    Ok(sasl::Outcome::Success(keys))
 }

 impl SaslInitial {
@@ -210,7 +192,7 @@ impl SaslSentInner {
            .derive_client_key(&client_final_message.proof);

        // Auth fails either if keys don't match or it's pre-determined to fail.
-        if secret.is_password_invalid(&client_key).into() {
+        if client_key.sha256() != secret.stored_key || secret.doomed {
            return Ok(sasl::Step::Failure("password doesn't match"));
        }

--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -1,31 +1,17 @@
 //! Tools for client/server/stored key management.

-use subtle::ConstantTimeEq;
-
 /// Faithfully taken from PostgreSQL.
 pub const SCRAM_KEY_LEN: usize = 32;

 /// One of the keys derived from the user's password.
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
-#[derive(Clone, Default, Eq, Debug)]
+#[derive(Clone, Default, PartialEq, Eq, Debug)]
 #[repr(transparent)]
 pub struct ScramKey {
    bytes: [u8; SCRAM_KEY_LEN],
 }

-impl PartialEq for ScramKey {
-    fn eq(&self, other: &Self) -> bool {
-        self.ct_eq(other).into()
-    }
-}
-
-impl ConstantTimeEq for ScramKey {
-    fn ct_eq(&self, other: &Self) -> subtle::Choice {
-        self.bytes.ct_eq(&other.bytes)
-    }
-}
-
 impl ScramKey {
    pub fn sha256(&self) -> Self {
        super::sha256([self.as_ref()]).into()
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -206,28 +206,6 @@ mod tests {
        }
    }

-    #[test]
-    fn parse_client_first_message_with_invalid_gs2_authz() {
-        assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none())
-    }
-
-    #[test]
-    fn parse_client_first_message_with_extra_params() {
-        let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap();
-        assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz");
-        assert_eq!(msg.username, "user");
-        assert_eq!(msg.nonce, "nonce");
-        assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient);
-    }
-
-    #[test]
-    fn parse_client_first_message_with_extra_params_invalid() {
-        // must be of the form `<ascii letter>=<...>`
-        assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none());
-        assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none());
-        assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none());
-    }
-
    #[test]
    fn parse_client_final_message() {
        let input = [
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -1,7 +1,5 @@
 //! Tools for SCRAM server secret management.

-use subtle::{Choice, ConstantTimeEq};
-
 use super::base64_decode_array;
 use super::key::ScramKey;

@@ -42,11 +40,6 @@ impl ServerSecret {
        Some(secret)
    }

-    pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice {
-        // constant time to not leak partial key match
-        client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8)
-    }
-
    /// To avoid revealing information to an attacker, we use a
    /// mocked server secret even if the user doesn't exist.
    /// See `auth-scram.c : mock_scram_secret` for details.
@@ -66,8 +59,10 @@ impl ServerSecret {
    /// Build a new server secret from the prerequisites.
    /// XXX: We only use this function in tests.
    #[cfg(test)]
-    pub async fn build(password: &str) -> Option<Self> {
-        Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await)
+    pub fn build(password: &str) -> Option<Self> {
+        Self::parse(&postgres_protocol::password::scram_sha_256(
+            password.as_bytes(),
+        ))
    }
 }

--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.77.0"
+channel = "1.76.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -225,7 +225,6 @@ async fn write_segment(
    assert!(from <= to);
    assert!(to <= wal_seg_size);

-    #[allow(clippy::suspicious_open_options)]
    let mut file = OpenOptions::new()
        .create(true)
        .write(true)
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -20,7 +20,7 @@ use std::io::Write as _;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
 use tracing::{info_span, Instrument};
-use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter};
+use utils::http::endpoint::{request_span, ChannelWriter};

 use crate::debug_dump::TimelineDigestRequest;
 use crate::receive_wal::WalReceiverState;
@@ -515,7 +515,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
    router
        .data(Arc::new(conf))
        .data(auth)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .get("/v1/status", |r| request_span(r, status_handler))
        .put("/v1/failpoints", |r| {
            request_span(r, move |r| async {
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -221,7 +221,6 @@ impl PhysicalStorage {
            // half initialized segment, first bake it under tmp filename and
            // then rename.
            let tmp_path = self.timeline_dir.join("waltmp");
-            #[allow(clippy::suspicious_open_options)]
            let mut file = OpenOptions::new()
                .create(true)
                .write(true)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -51,7 +51,7 @@ from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
    DEFAULT_PAGESERVER_ALLOWED_ERRORS,
-    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
+    scan_pageserver_log_for_errors,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
@@ -77,7 +77,6 @@ from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
    allure_add_grafana_links,
    allure_attach_from_dir,
-    assert_no_errors,
    get_self_dir,
    subprocess_capture,
    wait_until,
@@ -945,8 +944,6 @@ class NeonEnvBuilder:
            for pageserver in self.env.pageservers:
                pageserver.assert_no_errors()

-            self.env.storage_controller.assert_no_errors()
-
        try:
            self.overlay_cleanup_teardown()
        except Exception as e:
@@ -1892,6 +1889,19 @@ class NeonCli(AbstractNeonCli):

        return self.raw_cli(args, check_return_code=True)

+    def tenant_migrate(
+        self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
+    ):
+        args = [
+            "tenant",
+            "migrate",
+            "--tenant-id",
+            str(tenant_shard_id),
+            "--id",
+            str(new_pageserver),
+        ]
+        return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
+
    def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
        return self.raw_cli(["start"], check_return_code=check_return_code)

@@ -1951,7 +1961,6 @@ class NeonStorageController(MetricsGetter):
        self.env = env
        self.running = False
        self.auth_enabled = auth_enabled
-        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS

    def start(self):
        assert not self.running
@@ -1976,11 +1985,6 @@ class NeonStorageController(MetricsGetter):
                msg = ""
            raise StorageControllerApiException(msg, res.status_code) from e

-    def assert_no_errors(self):
-        assert_no_errors(
-            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
-        )
-
    def pageserver_api(self) -> PageserverHttpClient:
        """
        The storage controller implements a subset of the pageserver REST API, for mapping
@@ -2143,25 +2147,13 @@ class NeonStorageController(MetricsGetter):
        """
        response = self.request(
            "GET",
-            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate",
            headers=self.headers(TokenScope.ADMIN),
        )
        body = response.json()
        shards: list[dict[str, Any]] = body["shards"]
        return shards

-    def tenant_describe(self, tenant_id: TenantId):
-        """
-        :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
-        """
-        response = self.request(
-            "GET",
-            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        response.raise_for_status()
-        return response.json()
-
    def tenant_shard_split(
        self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None
    ) -> list[TenantShardId]:
@@ -2365,9 +2357,18 @@ class NeonPageserver(PgProtocol):
        return self.env.repo_dir / f"pageserver_{self.id}"

    def assert_no_errors(self):
-        assert_no_errors(
-            self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
-        )
+        logfile = self.workdir / "pageserver.log"
+        if not logfile.exists():
+            log.warning(f"Skipping log check: {logfile} does not exist")
+            return
+
+        with logfile.open("r") as f:
+            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
+
+        for _lineno, error in errors:
+            log.info(f"not allowed error: {error.strip()}")
+
+        assert not errors

    def assert_no_metric_errors(self):
        """
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -89,16 +89,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
 )


-DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
-    # Many tests will take pageservers offline, resulting in log warnings on the controller
-    # failing to connect to them.
-    ".*Call to node.*management API.*failed.*receive body.*",
-    ".*Call to node.*management API.*failed.*ReceiveBody.*",
-    # Many tests will start up with a node offline
-    ".*startup_reconcile: Could not scan node.*",
-]
-
-
 def _check_allowed_errors(input):
    allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -626,17 +626,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        res_json = res.json()
        return res_json

-    def timeline_layer_map_info(
-        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
-    ):
-        log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}")
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer",
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        return res_json
-
    def timeline_checkpoint(
        self,
        tenant_id: Union[TenantId, TenantShardId],
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -158,9 +158,6 @@ class TenantShardId:
    def __str__(self):
        return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"

-    def __repr__(self):
-        return self.__str__()
-
    def _tuple(self) -> tuple[TenantId, int, int]:
        return (self.tenant_id, self.shard_number, self.shard_count)

--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -11,7 +11,6 @@ from typing import (
    Any,
    Callable,
    Dict,
-    Iterable,
    List,
    Optional,
    Tuple,
@@ -448,39 +447,3 @@ def humantime_to_ms(humantime: str) -> float:
            )

    return round(total_ms, 3)
-
-
-def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
-    error_or_warn = re.compile(r"\s(ERROR|WARN)")
-    errors = []
-    for lineno, line in enumerate(input, start=1):
-        if len(line) == 0:
-            continue
-
-        if error_or_warn.search(line):
-            # Is this a torn log line?  This happens when force-killing a process and restarting
-            # Example: "2023-10-25T09:38:31.752314Z  WARN deletion executo2023-10-25T09:38:31.875947Z  INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
-            if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
-                continue
-
-            # It's an ERROR or WARN. Is it in the allow-list?
-            for a in allowed_errors:
-                if re.match(a, line):
-                    break
-            else:
-                errors.append((lineno, line))
-    return errors
-
-
-def assert_no_errors(log_file, service, allowed_errors):
-    if not log_file.exists():
-        log.warning(f"Skipping {service} log check: {log_file} does not exist")
-        return
-
-    with log_file.open("r") as f:
-        errors = scan_log_for_errors(f, allowed_errors)
-
-    for _lineno, error in errors:
-        log.info(f"not allowed {service} error: {error.strip()}")
-
-    assert not errors, f"Log errors on {service}: {errors[0]}"
--- a/test_runner/performance/test_gc_feedback.py
+++ b/test_runner/performance/test_gc_feedback.py
@@ -1,5 +1,3 @@
-import json
-
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
@@ -81,8 +79,3 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
    zenbenchmark.record(
        "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER
    )
-
-    layer_map_path = env.repo_dir / "layer-map.json"
-    log.info(f"Writing layer map to {layer_map_path}")
-    with layer_map_path.open("w") as f:
-        f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

-    error_regexes = [
-        ".*invalid branch start lsn: less than latest GC cutoff.*",
-        ".*invalid branch start lsn: less than planned GC cutoff.*",
-    ]
-    env.pageserver.allowed_errors.extend(error_regexes)
-    env.storage_controller.allowed_errors.extend(error_regexes)
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*invalid branch start lsn: less than latest GC cutoff.*",
+            ".*invalid branch start lsn: less than planned GC cutoff.*",
+        ]
+    )

    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -14,12 +14,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
    env = neon_env_builder.init_start()

-    error_regexes = [
-        ".*invalid branch start lsn.*",
-        ".*invalid start lsn .* for ancestor timeline.*",
-    ]
-    env.pageserver.allowed_errors.extend(error_regexes)
-    env.storage_controller.allowed_errors.extend(error_regexes)
+    env.pageserver.allowed_errors.extend(
+        [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
+    )

    # Branch at the point where only 100 rows were inserted
    branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -238,10 +238,6 @@ def test_forward_compatibility(
            pg_distrib_dir=compatibility_postgres_distrib_dir,
        )

-        # TODO: remove this workaround after release-5090 is no longer the most recent release.
-        # There was a bug in that code that generates a warning in the storage controller log.
-        env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")
-
        # Use current neon_local even though we're using old binaries for
        # everything else: our test code is written for latest CLI args.
        env.neon_local_binpath = neon_local_binpath
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -70,7 +70,6 @@ def test_metric_collection(
            # we have a fast rate of calculation, these can happen at shutdown
            ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
            ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
-            ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*",
        ]
    )

--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -432,10 +432,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     - Eviction of layers on the attached location results in deletion
       on the secondary location as well.
    """
-
-    # For debug of https://github.com/neondatabase/neon/issues/6966
-    neon_env_builder.rust_log_override = "DEBUG"
-
    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -580,7 +576,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll
    timeline_id = TimelineId.generate()

    env.neon_cli.create_tenant(
-        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}'
    )

    attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,6 +1,5 @@
 import os
 import time
-from collections import defaultdict
 from typing import Dict, List, Optional, Union

 import pytest
@@ -14,7 +13,7 @@ from fixtures.neon_fixtures import (
    tenant_get_shards,
 )
 from fixtures.remote_storage import s3_storage
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.types import Lsn, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 from pytest_httpserver import HTTPServer
@@ -160,20 +159,11 @@ def test_sharding_split_smoke(

    neon_env_builder.preserve_database_files = True

-    non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024}
-
-    env = neon_env_builder.init_configs(True)
-    neon_env_builder.start()
-    tenant_id = TenantId.generate()
-    timeline_id = TimelineId.generate()
-    env.neon_cli.create_tenant(
-        tenant_id,
-        timeline_id,
-        shard_count=shard_count,
-        shard_stripe_size=stripe_size,
-        placement_policy='{"Attached": 1}',
-        conf=non_default_tenant_config,
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
    )
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
    workload = Workload(env, tenant_id, timeline_id, branch_name="main")
    workload.init()

@@ -233,14 +223,6 @@ def test_sharding_split_smoke(
    # Before split, old shards exist
    assert shards_on_disk(old_shard_ids)

-    # Before split, we have done one reconcile for each shard
-    assert (
-        env.storage_controller.get_metric_value(
-            "storage_controller_reconcile_complete_total", filter={"status": "ok"}
-        )
-        == shard_count
-    )
-
    env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)

    post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
@@ -282,66 +264,43 @@ def test_sharding_split_smoke(
        destination = migrate_to_pageserver_ids.pop()

        log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
-        env.storage_controller.tenant_shard_migrate(migrate_shard, destination)
+        env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)

    workload.validate()

-    # Assert on how many reconciles happened during the process.  This is something of an
-    # implementation detail, but it is useful to detect any bugs that might generate spurious
-    # extra reconcile iterations.
-    #
-    # We'll have:
-    # - shard_count reconciles for the original setup of the tenant
-    # - shard_count reconciles for detaching the original secondary locations during split
-    # - split_shard_count reconciles during shard splitting, for setting up secondaries.
-    # - shard_count reconciles for the migrations we did to move child shards away from their split location
-    expect_reconciles = shard_count * 2 + split_shard_count + shard_count
+    # Check that we didn't do any spurious reconciliations.
+    # Total number of reconciles should have been one per original shard, plus
+    # one for each shard that was migrated.
    reconcile_ok = env.storage_controller.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
    )
-    assert reconcile_ok == expect_reconciles
+    assert reconcile_ok == shard_count + split_shard_count // 2

    # Check that no cancelled or errored reconciliations occurred: this test does no
    # failure injection and should run clean.
-    cancelled_reconciles = env.storage_controller.get_metric_value(
-        "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
+    assert (
+        env.storage_controller.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
+        )
+        is None
    )
-    errored_reconciles = env.storage_controller.get_metric_value(
-        "storage_controller_reconcile_complete_total", filter={"status": "error"}
+    assert (
+        env.storage_controller.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "error"}
+        )
+        is None
    )
-    assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0
-    assert errored_reconciles is not None and int(errored_reconciles) == 0

    env.storage_controller.consistency_check()

-    def get_node_shard_counts(env: NeonEnv, tenant_ids):
-        total: defaultdict[int, int] = defaultdict(int)
-        attached: defaultdict[int, int] = defaultdict(int)
-        for tid in tenant_ids:
-            for shard in env.storage_controller.tenant_describe(tid)["shards"]:
-                log.info(
-                    f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} "
-                )
-                for node in shard["node_secondary"]:
-                    total[int(node)] += 1
-                attached[int(shard["node_attached"])] += 1
-                total[int(shard["node_attached"])] += 1
+    # Validate pageserver state
+    shards_exist: list[TenantShardId] = []
+    for pageserver in env.pageservers:
+        locations = pageserver.http_client().tenant_list_locations()
+        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])

-        return total, attached
-
-    def check_effective_tenant_config():
-        # Expect our custom tenant configs to have survived the split
-        for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]:
-            node = env.get_pageserver(int(shard["node_attached"]))
-            config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"]))
-            for k, v in non_default_tenant_config.items():
-                assert config.effective_config[k] == v
-
-    # Validate pageserver state: expect every child shard to have an attached and secondary location
-    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
-    assert sum(attached.values()) == split_shard_count
-    assert sum(total.values()) == split_shard_count * 2
-    check_effective_tenant_config()
+    log.info(f"Shards after split: {shards_exist}")
+    assert len(shards_exist) == split_shard_count

    # Ensure post-split pageserver locations survive a restart (i.e. the child shards
    # correctly wrote config to disk, and the storage controller responds correctly
@@ -350,11 +309,13 @@ def test_sharding_split_smoke(
        pageserver.stop()
        pageserver.start()

-    # Validate pageserver state: expect every child shard to have an attached and secondary location
-    (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
-    assert sum(attached.values()) == split_shard_count
-    assert sum(total.values()) == split_shard_count * 2
-    check_effective_tenant_config()
+    shards_exist = []
+    for pageserver in env.pageservers:
+        locations = pageserver.http_client().tenant_list_locations()
+        shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"])
+
+    log.info(f"Shards after restart: {shards_exist}")
+    assert len(shards_exist) == split_shard_count

    workload.validate()

@@ -760,32 +721,9 @@ def test_sharding_split_failures(
    initial_shard_count = 2
    split_shard_count = 4

-    env = neon_env_builder.init_configs()
-    env.start()
-
-    tenant_id = TenantId.generate()
-    timeline_id = TimelineId.generate()
-
-    # Create a tenant with secondary locations enabled
-    env.neon_cli.create_tenant(
-        tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}'
-    )
-
-    env.storage_controller.allowed_errors.extend(
-        [
-            # All split failures log a warning when then enqueue the abort operation
-            ".*Enqueuing background abort.*",
-            # We exercise failure cases where abort itself will also fail (node offline)
-            ".*abort_tenant_shard_split.*",
-            ".*Failed to abort.*",
-            # Tolerate any error lots that mention a failpoint
-            ".*failpoint.*",
-            # Node offline cases will fail to send requests
-            ".*Reconcile error: receive body: error sending request for url.*",
-            # Node offline cases will fail inside reconciler when detaching secondaries
-            ".*Reconcile error on shard.*: receive body: error sending request for url.*",
-        ]
-    )
+    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    for ps in env.pageservers:
        # When we do node failures and abandon a shard, it will de-facto have old generation and
@@ -821,8 +759,7 @@ def test_sharding_split_failures(
    # will have succeeded: the net result should be to return to a clean state, including
    # detaching any child shards.
    def assert_rolled_back(exclude_ps_id=None) -> None:
-        secondary_count = 0
-        attached_count = 0
+        count = 0
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue
@@ -830,25 +767,13 @@ def test_sharding_split_failures(
            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
            for loc in locations:
                tenant_shard_id = TenantShardId.parse(loc[0])
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
                assert tenant_shard_id.shard_count == initial_shard_count
-                if loc[1]["mode"] == "Secondary":
-                    secondary_count += 1
-                else:
-                    attached_count += 1
-
-        if exclude_ps_id is not None:
-            # For a node failure case, we expect there to be a secondary location
-            # scheduled on the offline node, so expect one fewer secondary in total
-            assert secondary_count == initial_shard_count - 1
-        else:
-            assert secondary_count == initial_shard_count
-
-        assert attached_count == initial_shard_count
+                count += 1
+        assert count == initial_shard_count

    def assert_split_done(exclude_ps_id=None) -> None:
-        secondary_count = 0
-        attached_count = 0
+        count = 0
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue
@@ -856,14 +781,10 @@ def test_sharding_split_failures(
            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
            for loc in locations:
                tenant_shard_id = TenantShardId.parse(loc[0])
-                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
                assert tenant_shard_id.shard_count == split_shard_count
-                if loc[1]["mode"] == "Secondary":
-                    secondary_count += 1
-                else:
-                    attached_count += 1
-        assert attached_count == split_shard_count
-        assert secondary_count == split_shard_count
+                count += 1
+        assert count == split_shard_count

    def finish_split():
        # Having failed+rolled back, we should be able to split again
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -23,7 +23,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.types import TenantId, TenantShardId, TimelineId
+from fixtures.types import TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from mypy_boto3_s3.type_defs import (
    ObjectTypeDef,
@@ -177,7 +177,6 @@ def test_node_status_after_restart(
    assert len(nodes) == 2

    env.pageservers[1].stop()
-    env.storage_controller.allowed_errors.extend([".*Could not scan node"])

    env.storage_controller.stop()
    env.storage_controller.start()
@@ -682,9 +681,6 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
    tenant_id = TenantId.generate()
    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

-    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
-    env.storage_controller.allowed_errors.append(".*Forbidden.*")
-
    # No token
    with pytest.raises(
        StorageControllerApiException,
@@ -847,12 +843,6 @@ def test_sharding_service_heartbeats(
    env = neon_env_builder.init_configs()
    env.start()

-    # Default log allow list permits connection errors, but this test will use error responses on
-    # the utilization endpoint.
-    env.storage_controller.allowed_errors.append(
-        ".*Call to node.*management API.*failed.*failpoint.*"
-    )
-
    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
@@ -948,65 +938,3 @@ def test_sharding_service_heartbeats(
        env.storage_controller.consistency_check()

    wait_until(10, 1, storage_controller_consistent)
-
-
-def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
-    """
-    Exercise the behavior of the /re-attach endpoint on pageserver startup when
-    pageservers have a mixture of attached and secondary locations
-    """
-
-    neon_env_builder.num_pageservers = 2
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    # We'll have two tenants.
-    tenant_a = TenantId.generate()
-    env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
-    tenant_b = TenantId.generate()
-    env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')
-
-    # Each pageserver will have one attached and one secondary location
-    env.storage_controller.tenant_shard_migrate(
-        TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
-    )
-    env.storage_controller.tenant_shard_migrate(
-        TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
-    )
-
-    # Hard-fail a pageserver
-    victim_ps = env.pageservers[1]
-    survivor_ps = env.pageservers[0]
-    victim_ps.stop(immediate=True)
-
-    # Heatbeater will notice it's offline, and consequently attachments move to the other pageserver
-    def failed_over():
-        locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
-        log.info(f"locations: {locations}")
-        assert len(locations) == 2
-        assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)
-
-    # We could pre-empty this by configuring the node to Offline, but it's preferable to test
-    # the realistic path we would take when a node restarts uncleanly.
-    # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
-    wait_until(30, 1, failed_over)
-
-    reconciles_before_restart = env.storage_controller.get_metric_value(
-        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
-    )
-
-    # Restart the failed pageserver
-    victim_ps.start()
-
-    # We expect that the re-attach call correctly tipped off the pageserver that its locations
-    # are all secondaries now.
-    locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
-    assert len(locations) == 2
-    assert all(loc[1]["mode"] == "Secondary" for loc in locations)
-
-    # We expect that this situation resulted from the re_attach call, and not any explicit
-    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
-    reconciles_after_restart = env.storage_controller.get_metric_value(
-        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
-    )
-    assert reconciles_after_restart == reconciles_before_restart
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -36,9 +36,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    )
    [d for d in tenants_dir.iterdir()]

-    error_regexes = [".*tenant-config-before-write.*"]
-    neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
-    neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)
+    neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")

    pageserver_http = neon_simple_env.pageserver.http_client()
    pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,7 +20,6 @@ from fixtures.neon_fixtures import (
    VanillaPostgres,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
@@ -685,13 +684,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS


-def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
-    def condition():
-        assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
-
-    wait_until(5, 1.0, condition)
-
-
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    """
    Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
@@ -775,7 +767,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    # That one that we successfully accessed is now Active
    expect_activated += 1
    assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
-    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
+        == expect_activated - 1
+    )

    # The ones we didn't touch are still in Attaching
    assert (
@@ -795,7 +790,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
        == n_tenants - expect_activated
    )

-    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
+        == expect_activated - 1
+    )

    # When we unblock logical size calculation, all tenants should proceed to active state via
    # the warmup route.
@@ -815,7 +813,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    assert (
        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
    )
-    wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)
+    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants

    # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
    # body of the test because it will disrupt tenant counts
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -64,7 +64,6 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
-sha2 = { version = "0.10", features = ["asm"] }
 smallvec = { version = "1", default-features = false, features = ["write"] }
 subtle = { version = "2" }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }