diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 810c61de2d..2bcda7cc8e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -461,6 +461,7 @@ jobs: - name: Pytest regression tests uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 with: build_type: ${{ matrix.build_type }} test_selection: regress diff --git a/Cargo.lock b/Cargo.lock index e35fa564b9..625958fdb8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,8 +282,10 @@ dependencies = [ "control_plane", "diesel", "diesel_migrations", + "fail", "futures", "git-version", + "hex", "humantime", "hyper", "metrics", @@ -1344,6 +1346,7 @@ dependencies = [ "futures", "git-version", "hex", + "humantime", "hyper", "nix 0.27.1", "once_cell", @@ -3527,6 +3530,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", + "procfs", "rand 0.8.5", "regex", "remote_storage", @@ -5342,13 +5346,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e" +dependencies = [ + "cc", ] [[package]] @@ -5884,7 +5898,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" dependencies = [ "futures", "nix 0.26.4", @@ -6434,7 +6448,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" dependencies = [ "bytes", "io-uring", @@ -6477,6 +6491,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", + "async-compression", "async-trait", "bincode", "byteorder", @@ -6515,6 +6530,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tokio-tar", "tokio-test", "tokio-util", "tracing", @@ -6522,6 +6538,7 @@ dependencies = [ "tracing-subscriber", "url", "uuid", + "walkdir", "workspace_hack", ] @@ -7039,6 +7056,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sha2", "smallvec", "subtle", "syn 1.0.109", diff --git a/README.md b/README.md index c44ae695d6..00a90f4483 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop ## Running tests +### Rust unit tests + +We use [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Actions workflows. +Some crates no longer support plain `cargo test`; prefer `cargo nextest run` instead. +You can install `cargo-nextest` with `cargo install cargo-nextest`.
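For example, a usage sketch of the commands this README hunk describes (the crate name is illustrative):

```sh
# Install the runner once, then use it in place of `cargo test`
cargo install cargo-nextest

# Run the whole workspace's unit tests, or a single crate's
cargo nextest run
cargo nextest run -p pageserver
```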
+ +### Integration tests + Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). ```sh diff --git a/clippy.toml b/clippy.toml index 5f7dc66152..4c0c04f9a1 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,6 +2,8 @@ disallowed-methods = [ "tokio::task::block_in_place", # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", + # use tokio_epoll_uring_ext instead + "tokio_epoll_uring::thread_local_system", ] disallowed-macros = [ diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index ba3a84cda8..3b596a88ff 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { // which may happen in two cases: // - extension was just installed // - extension was already installed and is up to date - let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension version with query: {}", query); - client.simple_query(query)?; + // DISABLED due to compute node unpinning epic + // let query = "ALTER EXTENSION neon UPDATE"; + // info!("update neon extension version with query: {}", query); + // client.simple_query(query)?; Ok(()) } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { - info!("handle neon extension upgrade"); - let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension version with query: {}", query); - client.simple_query(query)?; +pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade (not really)"); + // DISABLED due to compute node unpinning epic + // let query = "ALTER EXTENSION neon UPDATE"; + // info!("update neon extension version with query: {}", query); + // client.simple_query(query)?; Ok(()) } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 75e5dcb7f8..b544a8c587 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,6 +12,7 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true git-version.workspace = true +humantime.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index a5fad7216c..f78f56c480 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -19,8 +19,10 @@ aws-config.workspace = true aws-sdk-secretsmanager.workspace = true camino.workspace = true clap.workspace = true +fail.workspace = true futures.workspace = true git-version.workspace = true +hex.workspace = true hyper.workspace = true humantime.workspace = true once_cell.workspace = true diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql new file mode 100644 index 0000000000..897c7e0d01 --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}'; +UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}'; \ No newline at end of file diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql 
b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql new file mode 100644 index 0000000000..c898ac9aee --- /dev/null +++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}'; +UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"'; \ No newline at end of file diff --git a/control_plane/attachment_service/src/heartbeater.rs b/control_plane/attachment_service/src/heartbeater.rs new file mode 100644 index 0000000000..e15de28920 --- /dev/null +++ b/control_plane/attachment_service/src/heartbeater.rs @@ -0,0 +1,227 @@ +use futures::{stream::FuturesUnordered, StreamExt}; +use std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant}, +}; +use tokio_util::sync::CancellationToken; + +use pageserver_api::{ + controller_api::{NodeAvailability, UtilizationScore}, + models::PageserverUtilization, +}; + +use thiserror::Error; +use utils::id::NodeId; + +use crate::node::Node; + +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>, + cancel: CancellationToken, + + state: HashMap<NodeId, PageserverState>, + + max_unavailable_interval: Duration, + jwt_token: Option<String>, +} + +#[derive(Debug, Clone)] +pub(crate) enum PageserverState { + Available { + last_seen_at: Instant, + utilization: PageserverUtilization, + }, + Offline, +} + +#[derive(Debug)] +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); + +#[derive(Debug, Error)] +pub(crate) enum HeartbeaterError { + #[error("Cancelled")] + Cancel, } + +struct HeartbeatRequest { + pageservers: Arc<HashMap<NodeId, Node>>, + reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>, +} + +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>, +} + +impl Heartbeater { + pub(crate) fn new( + jwt_token: Option<String>, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>(); + let mut heartbeater = + HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + tokio::task::spawn(async move { heartbeater.run().await }); + + Self { sender } + } + + pub(crate) async fn heartbeat( + &self, + pageservers: Arc<HashMap<NodeId, Node>>, + ) -> Result<AvailablityDeltas, HeartbeaterError> { + let (sender, receiver) = tokio::sync::oneshot::channel(); + self.sender + .send(HeartbeatRequest { + pageservers, + reply: sender, + }) + .unwrap(); + + receiver.await.unwrap() + } +} + +impl HeartbeaterTask { + fn new( + receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>, + jwt_token: Option<String>, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + Self { + receiver, + cancel, + state: HashMap::new(), + max_unavailable_interval, + jwt_token, + } + } + + async fn run(&mut self) { + loop { + tokio::select! { + request = self.receiver.recv() => { + match request { + Some(req) => { + let res = self.heartbeat(req.pageservers).await; + req.reply.send(res).unwrap(); + }, + None => { return; } + } + }, + _ = self.cancel.cancelled() => return + } + } + } + + async fn heartbeat( + &mut self, + pageservers: Arc<HashMap<NodeId, Node>>, + ) -> Result<AvailablityDeltas, HeartbeaterError> { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, node) in &*pageservers { + heartbeat_futs.push({ + let jwt_token = self.jwt_token.clone(); + let cancel = self.cancel.clone(); + + // Clone the node and mark it as available such that the request + // goes through to the pageserver even when the node is marked offline.
+ // This doesn't impact the availability observed by [`crate::service::Service`]. + let mut node = node.clone(); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + async move { + let response = node + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 2, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let response = match response { + Some(r) => r, + None => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + }; + + let status = if let Ok(utilization) = response { + PageserverState::Available { + last_seen_at: Instant::now(), + utilization, + } + } else { + PageserverState::Offline + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, ps_state) in new_state { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &ps_state) { + (PageserverState::Offline, PageserverState::Offline) => {} + (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => { + if now - *last_seen_at >= self.max_unavailable_interval { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + deltas.push((node_id, ps_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = ps_state; + } + Vacant(vac) => { + vac.insert(ps_state); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 27ba5bdb65..076b3a2f70 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -10,9 +10,11 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use std::sync::Arc; use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_request_param}; +use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; use utils::{ @@ -26,7 +28,7 @@ use utils::{ }; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest, }; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; @@ -174,14 +176,14 @@ async fn handle_tenant_location_config( service: Arc<Service>, mut req: Request<Body>, ) -> Result<Response<Body>, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response( StatusCode::OK, service - .tenant_location_config(tenant_id, config_req) + .tenant_location_config(tenant_shard_id, config_req) .await?, ) } @@ -246,8 +248,10 @@ async fn handle_tenant_secondary_download( req: Request<Body>, ) -> Result<Response<Body>, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; - service.tenant_secondary_download(tenant_id).await?; - json_response(StatusCode::OK, ()) + let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; + json_response(status, progress) } async fn handle_tenant_delete( @@ -349,6 +353,16 @@ async fn handle_tenant_locate( json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } +async fn handle_tenant_describe( + service: Arc<Service>, + req: Request<Body>, +) -> Result<Response<Body>, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) +} + async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -387,7 +401,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, json_response( StatusCode::OK, - state.service.node_configure(config_req).await?, + state + .service + .node_configure( + config_req.node_id, + config_req.availability.map(NodeAvailability::from), + config_req.scheduling, + ) + .await?, ) } @@ -548,14 +569,17 @@ pub fn make_router( request_span(r, handle_node_drop) }) .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) + .get("/debug/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler(r, handle_tenant_locate) + }) .get("/debug/v1/scheduler", |r| { request_span(r, handle_scheduler_dump) }) .post("/debug/v1/consistency_check", |r| { request_span(r, handle_consistency_check) }) - .get("/control/v1/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) + .put("/debug/v1/failpoints", |r| { + request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) // Node operations .post("/control/v1/node", |r| { @@ -572,6 +596,9 @@ .put("/control/v1/tenant/:tenant_id/shard_split", |r| { tenant_service_handler(r, handle_tenant_shard_split) }) + .get("/control/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_describe) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. @@ -587,7 +614,7 @@ .get("/v1/tenant/:tenant_id/config", |r| { tenant_service_handler(r, handle_tenant_config_get) }) - .put("/v1/tenant/:tenant_id/location_config", |r| { + .put("/v1/tenant/:tenant_shard_id/location_config", |r| { tenant_service_handler(r, handle_tenant_location_config) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/control_plane/attachment_service/src/id_lock_map.rs new file mode 100644 index 0000000000..b03700b50c --- /dev/null +++ b/control_plane/attachment_service/src/id_lock_map.rs @@ -0,0 +1,54 @@ +use std::{collections::HashMap, sync::Arc}; + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
+/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap<T> +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. + entities: std::sync::Mutex<HashMap<T, Arc<tokio::sync::RwLock<()>>>>, +} + +impl<T> IdLockMap<T> +where + T: Eq + PartialEq + std::hash::Hash, +{ + pub(crate) fn shared( + &self, + key: T, + ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().read_owned() + } + + pub(crate) fn exclusive( + &self, + key: T, + ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().write_owned() + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, lock| lock.try_write().is_err()) + } +} + +impl<T> Default for IdLockMap<T> +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 796b465c10..4aff29f15b 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -3,7 +3,9 @@ use utils::seqwait::MonotonicCounter; mod auth; mod compute_hook; +mod heartbeater; pub mod http; +mod id_lock_map; pub mod metrics; mod node; pub mod persistence; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 333c3911e3..0a925a63f6 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -2,7 +2,7 @@ use anyhow::{anyhow, Context}; use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service}; +use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; @@ -54,6 +54,10 @@ struct Cli { /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service #[arg(long)] database_url: Option<String>, + + /// Grace period before marking an unresponsive pageserver offline + #[arg(long)] + max_unavailable_interval: Option<humantime::Duration>, } /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this @@ -206,6 +210,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { } fn main() -> anyhow::Result<()> { + let default_panic = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + default_panic(info); + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections.
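As a usage sketch of the `IdLockMap` added in `id_lock_map.rs` above (assumptions: plain `u64` keys stand in for `TenantId`/`NodeId`, and this is a trimmed, hypothetical copy rather than the controller's real call sites):

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// Trimmed-down copy of the IdLockMap idea above, keyed by plain u64.
#[derive(Default)]
struct IdLockMap {
    entities: Mutex<HashMap<u64, Arc<tokio::sync::RwLock<()>>>>,
}

impl IdLockMap {
    /// Return a future for an exclusive (write) lock on `key`. The outer
    /// Mutex is held only long enough to clone the Arc, so callers never
    /// await while holding it.
    fn exclusive(
        &self,
        key: u64,
    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
        let mut locked = self.entities.lock().unwrap();
        locked.entry(key).or_default().clone().write_owned()
    }

    /// Drop entries whose lock is currently unheld (try_write succeeds).
    fn housekeeping(&self) {
        let mut locked = self.entities.lock().unwrap();
        locked.retain(|_k, lock| lock.try_write().is_err())
    }
}

#[tokio::main]
async fn main() {
    let locks = IdLockMap::default();

    // Hold the per-id lock across an async mutation, the way the storage
    // controller guards tenant create/delete/split.
    let guard = locks.exclusive(42).await;
    // ... mutate state for id 42; concurrent writers on 42 wait here ...
    drop(guard);

    locks.housekeeping(); // the id-42 entry is unheld now, so it is dropped
}
```

The design point is that `exclusive()` returns an owned guard future, so the synchronous map lock is never held across an `.await`; stale entries are reclaimed lazily by `housekeeping()` instead of on guard drop.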
@@ -243,6 +253,10 @@ async fn async_main() -> anyhow::Result<()> { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, compute_hook_url: args.compute_hook_url, + max_unavailable_interval: args + .max_unavailable_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 27b03608fa..4167782715 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -12,7 +12,7 @@ use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; -use crate::persistence::NodePersistence; +use crate::{persistence::NodePersistence, scheduler::MaySchedule}; /// Represents the in-memory description of a Node. /// @@ -83,29 +83,38 @@ impl Node { } } - pub(crate) fn set_availability( - &mut self, - availability: NodeAvailability, - ) -> AvailabilityTransition { - use NodeAvailability::*; - let transition = match (self.availability, availability) { - (Offline, Active) => { + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + match self.get_availability_transition(availability) { + AvailabilityTransition::ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); - AvailabilityTransition::ToActive } - (Active, Offline) => { + AvailabilityTransition::ToOffline => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); - AvailabilityTransition::ToOffline } - _ => AvailabilityTransition::Unchanged, - }; + AvailabilityTransition::Unchanged => {} + } self.availability = availability; - transition + } + + /// Without modifying the availability of the node, convert the intended availability + /// into a description of the transition. + pub(crate) fn get_availability_transition( + &self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use AvailabilityTransition::*; + use NodeAvailability::*; + + match (self.availability, availability) { + (Offline, Active(_)) => ToActive, + (Active(_), Offline) => ToOffline, + _ => Unchanged, + } } /// Whether we may send API requests to this node. @@ -114,21 +123,21 @@ impl Node { // a reference to the original Node's cancellation status. Checking both of these results // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable // when we cloned it, or if the original Node instance's cancellation token was fired. - matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled() + matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled() } /// Is this node elegible to have work scheduled onto it? 
- pub(crate) fn may_schedule(&self) -> bool { - match self.availability { - NodeAvailability::Active => {} - NodeAvailability::Offline => return false, - } + pub(crate) fn may_schedule(&self) -> MaySchedule { + let score = match self.availability { + NodeAvailability::Active(score) => score, + NodeAvailability::Offline => return MaySchedule::No, + }; match self.scheduling { - NodeSchedulingPolicy::Active => true, - NodeSchedulingPolicy::Draining => false, - NodeSchedulingPolicy::Filling => true, - NodeSchedulingPolicy::Pause => false, + NodeSchedulingPolicy::Active => MaySchedule::Yes(score), + NodeSchedulingPolicy::Draining => MaySchedule::No, + NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), + NodeSchedulingPolicy::Pause => MaySchedule::No, } } @@ -146,8 +155,7 @@ impl Node { listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Filling, - // TODO: we shouldn't really call this Active until we've heartbeated it. - availability: NodeAvailability::Active, + availability: NodeAvailability::Offline, cancel: CancellationToken::new(), } } diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index aa08945834..209d8ff075 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -11,6 +11,9 @@ use diesel::prelude::*; use diesel::Connection; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; +use pageserver_api::shard::ShardConfigError; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -72,6 +75,14 @@ pub(crate) enum DatabaseError { Logical(String), } +#[must_use] +pub(crate) enum AbortShardSplitStatus { + /// We aborted the split in the database by reverting to the parent shards + Aborted, + /// The split had already been persisted. + Complete, +} + pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>; impl Persistence { @@ -200,15 +211,10 @@ let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes) .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; - for (tenant_id, tenant) in &mut decoded.tenants { - // Backward compat: an old attachments.json from before PR #6251, replace - // empty strings with proper defaults. - if tenant.tenant_id.is_empty() { - tenant.tenant_id = tenant_id.to_string(); - tenant.config = serde_json::to_string(&TenantConfig::default()) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; - tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single) - .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?; + for shard in decoded.tenants.values_mut() { + if shard.placement_policy == "\"Single\"" { + // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 + shard.placement_policy = "{\"Attached\":0}".to_string(); } } @@ -570,6 +576,51 @@ }) .await } + + /// Used when the remote part of a shard split failed: we will revert the database state to have only + /// the parent shards, with SplitState::Idle.
+ pub(crate) async fn abort_shard_split( + &self, + split_tenant_id: TenantId, + new_shard_count: ShardCount, + ) -> DatabaseResult<AbortShardSplitStatus> { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> { + let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; + + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } + + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ + count {new_shard_count:?} on tenant {split_tenant_id}" + ))); + } + + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; + + Ok(AbortShardSplitStatus::Aborted) + })?; + + Ok(aborted) + }) + .await + } } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably @@ -604,6 +655,28 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, } +impl TenantShardPersistence { + pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> { + if self.shard_count == 0 { + Ok(ShardIdentity::unsharded()) + } else { + Ok(ShardIdentity::new( + ShardNumber(self.shard_number as u8), + ShardCount::new(self.shard_count as u8), + ShardStripeSize(self.shard_stripe_size as u32), + )?) + } + } + + pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + /// Parts of [`crate::node::Node`] that are stored durably #[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] #[diesel(table_name = crate::schema::nodes)] diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 603da9bf02..f00f35c74b 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,5 +1,6 @@ use crate::persistence::Persistence; use crate::service; +use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -7,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use std::collections::HashMap; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; @@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +const DEFAULT_HEATMAP_PERIOD: &str = "60s"; + /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler { @@ -255,22 +258,81 @@ impl Reconciler { tenant_shard_id: TenantShardId, node: &Node, ) -> Result<(), ReconcileError> { - match node - .with_client_retries( - |client| async move { client.tenant_secondary_download(tenant_shard_id).await }, - &self.service_config.jwt_token, - 1, - 1, - Duration::from_secs(60), - &self.cancel, - ) - .await - { - None => Err(ReconcileError::Cancel), - Some(Ok(_)) => Ok(()), - Some(Err(e)) => { - tracing::info!(" (skipping destination download: {})", e); - Ok(()) + // This is not the timeout for a request, but the total amount of time we're willing to wait + // for a secondary location to get up to date before giving up and proceeding with the migration anyway. + const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + + // This is the long-polling interval for the secondary download requests we send to the destination pageserver + // during a migration. + const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + + let started_at = Instant::now(); + + loop { + let (status, progress) = match node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(REQUEST_DOWNLOAD_TIMEOUT), + ) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + REQUEST_DOWNLOAD_TIMEOUT * 2, + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(v)) => Ok(v), + Some(Err(e)) => { + // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before + // attaching, but we should not let an issue with a secondary location stop us proceeding + // with a live migration. + tracing::warn!("Failed to prepare by downloading layers on node {node}: {e}"); + return Ok(()); + } + }?; + + if status == StatusCode::OK { + tracing::info!( + "Downloads to {} complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + return Ok(()); + } else if status == StatusCode::ACCEPTED { + let total_runtime = started_at.elapsed(); + if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + total_runtime.as_millis(), + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working, + // it just makes the I/O performance for users less good. + return Ok(()); + } + + // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call + // to the pageserver is a long-poll. + tracing::info!( + "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); } } } @@ -413,7 +475,7 @@ impl Reconciler { } } - // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config( &self.shard, @@ -485,17 +547,29 @@ impl Reconciler { ) .await { - Some(Ok(observed)) => observed, + Some(Ok(observed)) => Some(observed), + Some(Err(mgmt_api::Error::ApiError(status, _msg))) + if status == StatusCode::NOT_FOUND => + { + None + } Some(Err(e)) => return Err(e.into()), None => return Err(ReconcileError::Cancel), }; tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); - self.observed.locations.insert( - attached_node.get_id(), - ObservedStateLocation { - conf: observed_conf, - }, - ); + match observed_conf { + Some(conf) => { + // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state, + // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running) + self.observed + .locations + .insert(attached_node.get_id(), ObservedStateLocation { conf }); + } + None => { + // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver. + self.observed.locations.remove(&attached_node.get_id()); + } + } } Ok(()) @@ -525,7 +599,12 @@ impl Reconciler { ))); }; - let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let mut wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do @@ -662,10 +741,26 @@ impl Reconciler { } } +/// We tweak the externally-set TenantConfig while configuring +/// locations, using our awareness of whether secondary locations +/// are in use to automatically enable/disable heatmap uploads. 
+fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig { + let mut config = config.clone(); + if has_secondaries { + if config.heatmap_period.is_none() { + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + } + } else { + config.heatmap_period = None; + } + config +} + pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, + has_secondaries: bool, ) -> LocationConfig { LocationConfig { mode: LocationConfigMode::AttachedSingle, @@ -674,7 +769,7 @@ shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, has_secondaries), } } @@ -689,6 +784,6 @@ shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, true), } } diff --git a/control_plane/attachment_service/src/scheduler.rs b/control_plane/attachment_service/src/scheduler.rs index 26a2707e8d..981ba26cce 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/control_plane/attachment_service/src/scheduler.rs @@ -1,4 +1,5 @@ use crate::{node::Node, tenant_state::TenantState}; +use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; use utils::{http::error::ApiError, id::NodeId}; @@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError { } #[derive(Serialize, Eq, PartialEq)] +pub enum MaySchedule { + Yes(UtilizationScore), + No, +} + +#[derive(Serialize)] struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). - may_schedule: bool, + may_schedule: MaySchedule, } +impl PartialEq for SchedulerNode { + fn eq(&self, other: &Self) -> bool { + let may_schedule_matches = matches!( + (&self.may_schedule, &other.may_schedule), + (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No) + ); + + may_schedule_matches && self.shard_count == other.shard_count + } +} + +impl Eq for SchedulerNode {} + /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver /// on which to run. /// @@ -186,13 +206,15 @@ impl Scheduler { return None; } + // TODO: When the utilization score returned by the pageserver becomes meaningful, + // schedule based on that instead of the shard count.
let node = nodes .iter() .map(|node_id| { let may_schedule = self .nodes .get(node_id) - .map(|n| n.may_schedule) + .map(|n| n.may_schedule != MaySchedule::No) .unwrap_or(false); (*node_id, may_schedule) }) @@ -211,7 +233,7 @@ impl Scheduler { .nodes .iter() .filter_map(|(k, v)| { - if hard_exclude.contains(k) || !v.may_schedule { + if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { None } else { Some((*k, v.shard_count)) @@ -230,7 +252,7 @@ impl Scheduler { for (node_id, node) in &self.nodes { tracing::info!( "Node {node_id}: may_schedule={} shards={}", - node.may_schedule, + node.may_schedule != MaySchedule::No, node.shard_count ); } @@ -255,6 +277,7 @@ impl Scheduler { pub(crate) mod test_utils { use crate::node::Node; + use pageserver_api::controller_api::{NodeAvailability, UtilizationScore}; use std::collections::HashMap; use utils::id::NodeId; /// Test helper: synthesize the requested number of nodes, all in active state. @@ -264,13 +287,14 @@ pub(crate) mod test_utils { (1..n + 1) .map(|i| { (NodeId(i), { - let node = Node::new( + let mut node = Node::new( NodeId(i), format!("httphost-{i}"), 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, ); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); assert!(node.is_available()); node }) diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index ea301d0372..e38007c7af 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -7,6 +7,9 @@ use std::{ time::{Duration, Instant}, }; +use crate::{ + id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError, +}; use anyhow::Context; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, @@ -16,18 +19,21 @@ use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, PlacementPolicy, - TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, - models::TenantConfigRequest, + models::{SecondaryProgress, TenantConfigRequest}, }; + use pageserver_api::{ models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters, - TenantConfig, TenantCreateRequest, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, + TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, + TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, upcall_api::{ @@ -36,6 +42,7 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; +use tokio::sync::OwnedRwLockWriteGuard; use tokio_util::sync::CancellationToken; use 
tracing::instrument; use utils::{ @@ -49,6 +56,7 @@ use utils::{ use crate::{ compute_hook::{self, ComputeHook}, + heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, @@ -76,6 +84,8 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); + // Top level state available to all HTTP handlers struct ServiceState { tenants: BTreeMap<TenantShardId, TenantState>, @@ -123,6 +133,11 @@ pub struct Config { /// (this URL points to the control plane in prod). If this is None, the compute hook will /// assume it is running in a test environment and try to update neon_local. pub compute_hook_url: Option<String>, + + /// Grace period within which a pageserver may fail to respond to heartbeats and still be + /// considered active. Once the grace period elapses, the next heartbeat failure will + /// mark the pageserver offline. + pub max_unavailable_interval: Duration, } impl From<DatabaseError> for ApiError { @@ -147,6 +162,20 @@ pub struct Service { compute_hook: Arc<ComputeHook>, result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>, + heartbeater: Heartbeater, + + // Channel for background cleanup of failed operations that require cleanup, such as shard split + abort_tx: tokio::sync::mpsc::UnboundedSender<TenantShardSplitAbort>, + + // Locking on a tenant granularity (covers all shards in the tenant): + // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) + // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) + tenant_op_locks: IdLockMap<TenantId>, + + // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or + // that transition it to/from Active. + node_op_locks: IdLockMap<NodeId>, + // Process shutdown will fire this token cancel: CancellationToken, @@ -174,6 +203,50 @@ enum TenantCreateOrUpdate { Update(Vec<ShardUpdate>), } +struct ShardSplitParams { + old_shard_count: ShardCount, + new_shard_count: ShardCount, + new_stripe_size: Option<ShardStripeSize>, + targets: Vec<ShardSplitTarget>, + policy: PlacementPolicy, + shard_ident: ShardIdentity, +} + +// When preparing for a shard split, we may either choose to proceed with the split, +// or find that the work is already done and return NoOp. +enum ShardSplitAction { + Split(ShardSplitParams), + NoOp(TenantShardSplitResponse), +} + +// A parent shard which will be split +struct ShardSplitTarget { + parent_id: TenantShardId, + node: Node, + child_ids: Vec<TenantShardId>, +} + +/// When a tenant shard split operation fails, we may not be able to clean up immediately, because nodes +/// might not be available. We therefore use a queue of abort operations processed in the background.
+struct TenantShardSplitAbort { + tenant_id: TenantId, + /// The target values from the request that failed + new_shard_count: ShardCount, + new_stripe_size: Option<ShardStripeSize>, + /// Until this abort op is complete, no other operations may be done on the tenant + _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, +} + +#[derive(thiserror::Error, Debug)] +enum TenantShardSplitAbortError { + #[error(transparent)] + Database(#[from] DatabaseError), + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error("Unavailable")] + Unavailable, +} + struct ShardUpdate { tenant_shard_id: TenantShardId, placement_policy: PlacementPolicy, @@ -197,8 +270,6 @@ impl Service { let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> = HashMap::new(); - let mut nodes_online = HashSet::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. @@ -220,6 +291,9 @@ let mut cleanup = Vec::new(); let node_listings = self.scan_node_locations(node_scan_deadline).await; + // Send initial heartbeat requests to nodes that replied to the location listing above. + let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; + for (node_id, list_response) in node_listings { let tenant_shards = list_response.tenant_shards; tracing::info!( @@ -227,7 +301,6 @@ tenant_shards.len(), node_id ); - nodes_online.insert(node_id); for (tenant_shard_id, conf_opt) in tenant_shards { let shard_observations = observed.entry(tenant_shard_id).or_default(); @@ -246,8 +319,10 @@ // Mark nodes online if they responded to us: nodes are offline by default after a restart. let mut new_nodes = (**nodes).clone(); for (node_id, node) in new_nodes.iter_mut() { - if nodes_online.contains(node_id) { - node.set_availability(NodeAvailability::Active); + if let Some(utilization) = nodes_online.get(node_id) { + node.set_availability(NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + ))); scheduler.node_upsert(node); } } @@ -336,6 +411,49 @@ tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); } + async fn initial_heartbeat_round<'a>( + &self, + node_ids: impl Iterator<Item = &'a NodeId>, + ) -> HashMap<NodeId, PageserverUtilization> { + assert!(!self.startup_complete.is_ready()); + + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut nodes_to_heartbeat = HashMap::new(); + for node_id in node_ids { + match all_nodes.get(node_id) { + Some(node) => { + nodes_to_heartbeat.insert(*node_id, node.clone()); + } + None => { + tracing::warn!("Node {node_id} was removed during start-up"); + } + } + } + + let res = self + .heartbeater + .heartbeat(Arc::new(nodes_to_heartbeat)) + .await; + + let mut online_nodes = HashMap::new(); + if let Ok(deltas) = res { + for (node_id, status) in deltas.0 { + match status { + PageserverState::Available { utilization, .. } => { + online_nodes.insert(node_id, utilization); + } + PageserverState::Offline => {} + } + } + } + + online_nodes + } + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline.
/// /// The result includes only nodes which responded within the deadline @@ -356,7 +474,7 @@ impl Service { node_list_futs.push({ async move { tracing::info!("Scanning shards on node {node}..."); - let timeout = Duration::from_secs(5); + let timeout = Duration::from_secs(1); let response = node .with_client_retries( |client| async move { client.list_location_config().await }, @@ -551,6 +669,56 @@ impl Service { } } } + #[instrument(skip_all)] + async fn spawn_heartbeat_driver(&self) { + self.startup_complete.clone().wait().await; + + const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); + + let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + while !self.cancel.is_cancelled() { + tokio::select! { + _ = interval.tick() => { } + _ = self.cancel.cancelled() => return + }; + + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let res = self.heartbeater.heartbeat(nodes).await; + if let Ok(deltas) = res { + for (node_id, state) in deltas.0 { + let new_availability = match state { + PageserverState::Available { utilization, .. } => NodeAvailability::Active( + UtilizationScore(utilization.utilization_score), + ), + PageserverState::Offline => NodeAvailability::Offline, + }; + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; + + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!("Node {} was not found after heartbeat round", node_id); + } + Err(err) => { + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); + } + } + } + } + } + } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// was successful, this will update the observed state of the tenant such that subsequent @@ -591,7 +759,19 @@ impl Service { tenant.waiter.advance(result.sequence); } Err(e) => { - tracing::warn!("Reconcile error: {}", e); + match e { + ReconcileError::Cancel => { + tracing::info!("Reconciler was cancelled"); + } + ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // This might be due to the reconciler getting cancelled, or it might + // be due to the `Node` being marked offline. + tracing::info!("Reconciler cancelled during pageserver API call"); + } + _ => { + tracing::warn!("Reconcile error: {}", e); + } + } // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. @@ -627,8 +807,52 @@ impl Service { } } + async fn process_aborts( + &self, + mut abort_rx: tokio::sync::mpsc::UnboundedReceiver<TenantShardSplitAbort>, + ) { + loop { + // Wait for the next result, or for cancellation + let op = tokio::select! { + r = abort_rx.recv() => { + match r { + Some(op) => {op}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + // Retry until shutdown: we must keep this request object alive until it is properly + // processed, as it holds a lock guard that prevents other operations trying to do things + // to the tenant while it is in a weird part-split state. + while !self.cancel.is_cancelled() { + match self.abort_tenant_shard_split(&op).await { + Ok(_) => break, + Err(e) => { + tracing::warn!( + "Failed to abort shard split on {}, will retry: {e}", + op.tenant_id + ); + + // If a node is unavailable, we hope that it has been properly marked Offline + // when we retry, so that the abort op will succeed.
If the abort op is failing + // for some other reason, we will keep retrying forever, or until a human notices + // and does something about it (either fixing a pageserver or restarting the controller). + tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) + .await + .ok(); + } + } + } + } + } + pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); tracing::info!("Loading nodes from database..."); let nodes = persistence @@ -641,12 +865,62 @@ impl Service { tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() ); + // If any shard splits were in progress, reset the database state to abort them + let mut tenant_shard_count_min_max: HashMap<TenantId, (ShardCount, ShardCount)> = + HashMap::new(); + for tsp in &mut tenant_shard_persistence { + let shard = tsp.get_shard_identity()?; + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let entry = tenant_shard_count_min_max + .entry(tenant_shard_id.tenant_id) + .or_insert_with(|| (shard.count, shard.count)); + entry.0 = std::cmp::min(entry.0, shard.count); + entry.1 = std::cmp::max(entry.1, shard.count); + } + + for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max { + if count_min != count_max { + // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in + // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll + // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now. + tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}"); + let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?; + + // We may never see the Complete status here: if the split was complete, we wouldn't have + // identified this tenant as having mismatching min/max counts. + assert!(matches!(abort_status, AbortShardSplitStatus::Aborted)); + + // Clear the splitting status in-memory, to reflect that we just aborted in the database + tenant_shard_persistence.iter_mut().for_each(|tsp| { + // Set idle split state on those shards that we will retain. + let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap(); + if tsp_tenant_id == tenant_id + && tsp.get_shard_identity().unwrap().count == count_min + { + tsp.splitting = SplitState::Idle; + } else if tsp_tenant_id == tenant_id { + // Leave the splitting state on the child shards: this will be used next to + // drop them. + tracing::info!( + "Shard {tsp_tenant_id} will be dropped after shard split abort", + ); + } + }); + + // Drop shards for this tenant which we didn't just mark idle (i.e.
child shards of the aborted split) + tenant_shard_persistence.retain(|tsp| { + TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id + || tsp.splitting == SplitState::Idle + }); + } + } + let mut tenants = BTreeMap::new(); let mut scheduler = Scheduler::new(nodes.values()); @@ -676,21 +950,8 @@ impl Service { } } for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount::new(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); @@ -720,6 +981,12 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + let cancel = CancellationToken::new(); + let heartbeater = Heartbeater::new( + config.jwt_token.clone(), + config.max_unavailable_interval, + cancel.clone(), + ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, @@ -728,9 +995,13 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config)), result_tx, + heartbeater, + abort_tx, startup_complete: startup_complete.clone(), - cancel: CancellationToken::new(), + cancel, gate: Gate::default(), + tenant_op_locks: Default::default(), + node_op_locks: Default::default(), }); let result_task_this = this.clone(); @@ -741,6 +1012,33 @@ impl Service { } }); + tokio::task::spawn({ + let this = this.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = this.gate.enter() { + this.process_aborts(abort_rx).await + } + } + }); + + tokio::task::spawn({ + let this = this.clone(); + async move { + if let Ok(_gate) = this.gate.enter() { + loop { + tokio::select! 
{ + _ = this.cancel.cancelled() => { + break; + }, + _ = tokio::time::sleep(Duration::from_secs(60)) => {} + }; + this.tenant_op_locks.housekeeping(); + } + } + } + }); + tokio::task::spawn({ let this = this.clone(); // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] @@ -753,13 +1051,28 @@ impl Service { }; this.startup_reconcile().await; - drop(startup_completion); + } + }); + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; this.background_reconcile().await; } }); + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.spawn_heartbeat_driver().await; + } + }); + Ok(this) } @@ -782,7 +1095,7 @@ impl Service { shard_stripe_size: 0, generation: Some(0), generation_pageserver: None, - placement_policy: serde_json::to_string(&PlacementPolicy::Single).unwrap(), + placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(), splitting: SplitState::default(), }; @@ -809,7 +1122,7 @@ impl Service { TenantState::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), - PlacementPolicy::Single, + PlacementPolicy::Attached(0), ), ); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); @@ -818,11 +1131,37 @@ impl Service { } let new_generation = if let Some(req_node_id) = attach_req.node_id { - Some( - self.persistence - .increment_generation(attach_req.tenant_shard_id, req_node_id) - .await?, - ) + let maybe_tenant_conf = { + let locked = self.inner.write().unwrap(); + locked + .tenants + .get(&attach_req.tenant_shard_id) + .map(|t| t.config.clone()) + }; + + match maybe_tenant_conf { + Some(conf) => { + let new_generation = self + .persistence + .increment_generation(attach_req.tenant_shard_id, req_node_id) + .await?; + + // Persist the placement policy update. This is required + // when we are reattaching a detached tenant. + self.persistence + .update_tenant_shard( + attach_req.tenant_shard_id, + PlacementPolicy::Attached(0), + conf, + None, + ) + .await?; + Some(new_generation) + } + None => { + anyhow::bail!("Attach hook handling raced with tenant removal") + } + } } else { self.persistence.detach(attach_req.tenant_shard_id).await?; None @@ -837,6 +1176,7 @@ impl Service { if let Some(new_generation) = new_generation { tenant_state.generation = Some(new_generation); + tenant_state.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. @@ -889,6 +1229,7 @@ impl Service { tenant_state.generation.unwrap(), &tenant_state.shard, &tenant_state.config, + false, )), }, )]); @@ -918,6 +1259,118 @@ impl Service { } } + // When the availability state of a node transitions to active, we must do a full reconciliation + // of LocationConfigs on that node. This is because while a node was offline: + // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node + // - aborting a tenant shard split might have left rogue child shards behind on this node.
+ // + // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other + // Reconcilers might communicate with the node, and these must not overlap with the work we do in + // this function. + // + // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but + // is written for a single node rather than as a batch job for all nodes. + #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))] + async fn node_activate_reconcile( + &self, + mut node: Node, + _lock: &OwnedRwLockWriteGuard<()>, + ) -> Result<(), ApiError> { + // This Node is a mutable local copy: we will set it active so that we can use its + // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated + // later. + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + let configs = match node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // This node didn't succeed listing its locations: it may not proceed to active state + // as it is apparently unavailable. + return Err(ApiError::PreconditionFailed( + format!("Failed to query node location configs, cannot activate ({e})").into(), + )); + } + Some(Ok(configs)) => configs, + }; + tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); + + let mut cleanup = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + for (tenant_shard_id, observed_loc) in configs.tenant_shards { + let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push(tenant_shard_id); + continue; + }; + tenant_state + .observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + } + } + + for tenant_shard_id in cleanup { + tracing::info!("Detaching {tenant_shard_id}"); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }; + client + .location_config(tenant_shard_id, config, None, false) + .await + }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // Do not let the node proceed to Active state if it is not responsive to requests + // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing + // detach completing: we should not let this node back into the set of nodes considered + // okay for scheduling.
+ return Err(ApiError::Conflict(format!( + "Node {node} failed to detach {tenant_shard_id}: {e}" + ))); + } + Some(Ok(_)) => {} + }; + } + + Ok(()) + } + pub(crate) async fn re_attach( &self, reattach_req: ReAttachRequest, @@ -926,15 +1379,6 @@ impl Service { self.node_register(register_req).await?; } - // Take a re-attach as indication that the node is available: this is a precursor to proper - // heartbeating in https://github.com/neondatabase/neon/issues/6844 - self.node_configure(NodeConfigureRequest { - node_id: reattach_req.node_id, - availability: Some(NodeAvailability::Active), - scheduling: None, - }) - .await?; - // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; @@ -946,6 +1390,7 @@ impl Service { // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let mut response = ReAttachResponse { tenants: Vec::new(), @@ -957,7 +1402,7 @@ impl Service { gen: new_gen.into().unwrap(), }); // Apply the new generation number to our in-memory state - let shard_state = locked.tenants.get_mut(&tenant_shard_id); + let shard_state = tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { // Not fatal. This edge case requires a re-attach to happen // between inserting a new tenant shard in to the database, and updating our in-memory @@ -1008,6 +1453,26 @@ impl Service { // request in flight over the network: TODO handle that by making location_conf API refuse // to go backward in generations. } + + // We consider a node Active once we have composed a re-attach response, but we + // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response + // implicitly synchronizes the LocationConfigs on the node. + // + // Setting a node active unblocks any Reconcilers that might write to the location config API, + // but those requests will not be accepted by the node until it has finished processing + // the re-attach response. + if let Some(node) = nodes.get(&reattach_req.node_id) { + if !node.is_available() { + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + scheduler.node_upsert(node); + } + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } + } + Ok(response) } @@ -1048,6 +1513,12 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { + // Exclude any concurrent attempts to create/access the same tenant ID + let _tenant_lock = self + .tenant_op_locks + .exclusive(create_req.new_tenant_id.tenant_id) + .await; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; @@ -1058,11 +1529,11 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result<(TenantCreateResponse, Vec), ApiError> { - // As a default, single is convenient for tests that don't choose a policy. let placement_policy = create_req .placement_policy .clone() - .unwrap_or(PlacementPolicy::Single); + // As a default, zero secondaries is convenient for tests that don't choose a policy. + .unwrap_or(PlacementPolicy::Attached(0)); // This service expects to handle sharding itself: it is an error to try and directly create // a particular shard here. 
@@ -1262,6 +1733,7 @@ impl Service { let mut updates = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); // Use location config mode as an indicator of policy. let placement_policy = match req.config.mode { @@ -1271,11 +1743,11 @@ impl Service { | LocationConfigMode::AttachedSingle | LocationConfigMode::AttachedStale => { if nodes.len() > 1 { - PlacementPolicy::Double(1) + PlacementPolicy::Attached(1) } else { // Convenience for dev/test: if we just have one pageserver, import - // tenants into Single mode so that scheduling will succeed. - PlacementPolicy::Single + // tenants into non-HA mode so that scheduling will succeed. + PlacementPolicy::Attached(0) } } }; @@ -1326,12 +1798,10 @@ impl Service { TenantCreateOrUpdate::Create( // Synthesize a creation request TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), + new_tenant_id: tenant_shard_id, generation, shard_parameters: ShardParameters { - // Must preserve the incoming shard_count do distinguish unsharded (0) - // from single-sharded (1): this distinction appears in the S3 keys of the tenant. - count: req.tenant_id.shard_count, + count: tenant_shard_id.shard_count, // We only import un-sharded or single-sharded tenants, so stripe // size can be made up arbitrarily here. stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, @@ -1360,17 +1830,23 @@ impl Service { /// - Call with mode Detached to switch to PolicyMode::Detached pub(crate) async fn tenant_location_config( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, req: TenantLocationConfigRequest, ) -> Result { - if !req.tenant_id.is_unsharded() { + // We require an exclusive lock, because we are updating both persistent and in-memory state + let _tenant_lock = self + .tenant_op_locks + .exclusive(tenant_shard_id.tenant_id) + .await; + + if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); } // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -1476,6 +1952,9 @@ impl Service { } pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let tenant_id = req.tenant_id; let config = req.config; @@ -1557,6 +2036,8 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let node = { let locked = self.inner.read().unwrap(); // Just a sanity check to prevent misuse: the API expects that the tenant is fully @@ -1641,7 +2122,10 @@ impl Service { pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantId, - ) -> Result<(), ApiError> { + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress), ApiError> { + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { let locked = self.inner.read().unwrap(); @@ -1662,35 +2146,76 @@ impl Service { 
targets }; - // TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running - // downloads, they can return a clean 202 response instead of the HTTP client timing out. - // Issue concurrent requests to all shards' locations let mut futs = FuturesUnordered::new(); for (tenant_shard_id, node) in targets { let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); futs.push(async move { - let result = client.tenant_secondary_download(tenant_shard_id).await; - (result, node) + let result = client + .tenant_secondary_download(tenant_shard_id, wait) + .await; + (result, node, tenant_shard_id) }) } // Handle any errors returned by pageservers. This includes cases like this request racing with // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as // well as more general cases like 503s, 500s, or timeouts. - while let Some((result, node)) = futs.next().await { - let Err(e) = result else { continue }; - - // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever - // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache - // than they had hoped for. - tracing::warn!("Ignoring tenant secondary download error from pageserver {node}: {e}",); + let mut aggregate_progress = SecondaryProgress::default(); + let mut aggregate_status: Option = None; + let mut error: Option = None; + while let Some((result, node, tenant_shard_id)) = futs.next().await { + match result { + Err(e) => { + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!("Secondary download error from pageserver {node}: {e}",); + error = Some(e) + } + Ok((status_code, progress)) => { + tracing::info!(%tenant_shard_id, "Shard status={status_code} progress: {progress:?}"); + aggregate_progress.layers_downloaded += progress.layers_downloaded; + aggregate_progress.layers_total += progress.layers_total; + aggregate_progress.bytes_downloaded += progress.bytes_downloaded; + aggregate_progress.bytes_total += progress.bytes_total; + aggregate_progress.heatmap_mtime = + std::cmp::max(aggregate_progress.heatmap_mtime, progress.heatmap_mtime); + aggregate_status = match aggregate_status { + None => Some(status_code), + Some(StatusCode::OK) => Some(status_code), + Some(cur) => { + // Other status codes (e.g. 202) -- do not overwrite. + Some(cur) + } + }; + } + } } - Ok(()) + // If any of the shards return 202, indicate our result as 202. 
+ match aggregate_status { + None => { + match error { + Some(e) => { + // No successes, and an error: surface it + Err(ApiError::Conflict(format!("Error from pageserver: {e}"))) + } + None => { + // No shards found + Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )) + } + } + } + Some(aggregate_status) => Ok((aggregate_status, aggregate_progress)), + } } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; // TODO: refactor into helper @@ -1787,10 +2312,10 @@ impl Service { create_req.new_timeline_id, ); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1912,11 +2437,10 @@ impl Service { timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -2041,9 +2565,6 @@ impl Service { let locked = self.inner.read().unwrap(); tracing::info!("Locating shards for tenant {tenant_id}"); - // Take a snapshot of pageservers - let pageservers = locked.nodes.clone(); - let mut result = Vec::new(); let mut shard_params: Option = None; @@ -2057,7 +2578,8 @@ impl Service { "Cannot locate a tenant that is not attached" )))?; - let node = pageservers + let node = locked + .nodes .get(&node_id) .expect("Pageservers may not be deleted while referenced"); @@ -2105,21 +2627,360 @@ impl Service { }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + let mut shard_zero = None; + let mut shards = Vec::new(); + + for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + if tenant_shard_id.is_zero() { + shard_zero = Some(shard); + } + + let response_shard = TenantDescribeResponseShard { + tenant_shard_id: *tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard.last_error.lock().unwrap().clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + }; + shards.push(response_shard); + } + + let Some(shard_zero) = shard_zero else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + Ok(TenantDescribeResponse { + shards, + stripe_size: shard_zero.shard.stripe_size, + policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] + async fn abort_tenant_shard_split( + &self, + op: &TenantShardSplitAbort, + ) -> Result<(), TenantShardSplitAbortError> { + // Cleaning up a split: + // - Parent shards are not destroyed during a split, just detached. 
+ // - Failed pageserver split API calls can leave the remote node with just the parent attached, + // just the children attached, or both. + // + // Therefore our work to do is to: + // 1. Clean up storage controller's internal state to just refer to parents, no children + // 2. Call out to pageservers to ensure that children are detached + // 3. Call out to pageservers to ensure that parents are attached. + // + // Crash safety: + // - If the storage controller stops running during this cleanup *after* clearing the splitting state + // from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage + // and detach them. + // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state + // from our database, then we will re-enter this cleanup routine on startup. + + let TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + .. + } = op; + + // First abort persistent state, if any exists. + match self + .persistence + .abort_shard_split(*tenant_id, *new_shard_count) + .await? + { + AbortShardSplitStatus::Aborted => { + // Proceed to roll back any child shards created on pageservers + } + AbortShardSplitStatus::Complete => { + // The split completed (we might hit that path if e.g. our database transaction + // to write the completion landed in the database, but we dropped connection + // before seeing the result). + // + // We must update in-memory state to reflect the successful split. + self.tenant_shard_split_commit_inmem( + *tenant_id, + *new_shard_count, + *new_stripe_size, + ); + return Ok(()); + } + } + + // Clean up in-memory state, and accumulate the list of child locations that need detaching + let detach_locations: Vec<(Node, TenantShardId)> = { + let mut detach_locations = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) + { + if shard.shard.count == op.new_shard_count { + // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory + // is infallible, so if we got an error we shouldn't have got that far. + tracing::warn!( + "During split abort, child shard {tenant_shard_id} found in-memory" + ); + continue; + } + + // Add the children of this shard to this list of things to detach + if let Some(node_id) = shard.intent.get_attached() { + for child_id in tenant_shard_id.split(*new_shard_count) { + detach_locations.push(( + nodes + .get(node_id) + .expect("Intent references nonexistent node") + .clone(), + child_id, + )); + } + } else { + tracing::warn!( + "During split abort, shard {tenant_shard_id} has no attached location" + ); + } + + tracing::info!("Restoring parent shard {tenant_shard_id}"); + shard.splitting = SplitState::Idle; + self.maybe_reconcile_shard(shard, nodes); + } + + // We don't expect any new_shard_count shards to exist here, but drop them just in case + tenants.retain(|_id, s| s.shard.count != *new_shard_count); + + detach_locations + }; + + for (node, child_id) in detach_locations { + if !node.is_available() { + // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and + // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have + // removed child shards from our in-memory state and database, the reconciliation will implicitly remove + // them from the node. 
+ tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + continue; + } + + // Detach the remote child. If the pageserver split API call is still in progress, this call will get + // a 503 and retry, up to our limit. + tracing::info!("Detaching {child_id} on {node}..."); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: child_id.shard_number.0, + shard_count: child_id.shard_count.literal(), + // Stripe size and tenant config don't matter when detaching + shard_stripe_size: 0, + tenant_conf: TenantConfig::default(), + }; + + client.location_config(child_id, config, None, false).await + }, + &self.config.jwt_token, + 1, + 10, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => { + // We failed to communicate with the remote node. This is problematic: we may be + // leaving it with a rogue child shard. + tracing::warn!( + "Failed to detach child {child_id} from node {node} during abort" + ); + return Err(e.into()); + } + None => { + // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll + // clean up on restart. The node going offline requires a retry. + return Err(TenantShardSplitAbortError::Unavailable); + } + }; + } + + tracing::info!("Successfully aborted split"); + Ok(()) + } + + /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents + /// of the tenant map to reflect the child shards that exist after the split. + fn tenant_shard_split_commit_inmem( + &self, + tenant_id: TenantId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ) -> ( + TenantShardSplitResponse, + Vec<(TenantShardId, NodeId, ShardStripeSize)>, + ) { + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + let parent_ids = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(shard_id, _)| *shard_id) + .collect::>(); + + let (_nodes, tenants, scheduler) = locked.parts_mut(); + for parent_id in parent_ids { + let child_ids = parent_id.split(new_shard_count); + + let (pageserver, generation, policy, parent_ident, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + + // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds + // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup: + // nothing else can clear this. 
+ assert!(matches!(old_state.splitting, SplitState::Splitting)); + + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + ( + old_attached, + generation, + old_state.policy, + old_state.shard, + old_state.config, + ) + }; + + for child in child_ids { + let mut child_shard = parent_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + if let Some(stripe_size) = new_stripe_size { + child_shard.stripe_size = stripe_size; + } + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf( + generation, + &child_shard, + &config, + matches!(policy, PlacementPolicy::Attached(n) if n > 0), + )), + }, + ); + + let mut child_state = TenantState::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantState::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + + child_locations.push((child, pageserver, child_shard.stripe_size)); + + if let Err(e) = child_state.schedule(scheduler) { + // This is not fatal, because we've implicitly already got an attached + // location for the child shard. Failure here just means we couldn't + // find a secondary (e.g. because cluster is overloaded). + tracing::warn!("Failed to schedule child shard {child}: {e}"); + } + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + + (response, child_locations) + } + } + pub(crate) async fn tenant_shard_split( &self, tenant_id: TenantId, split_req: TenantShardSplitRequest, ) -> Result { + // TODO: return 503 if we get stuck waiting for this lock + // (issue https://github.com/neondatabase/neon/issues/7108) + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + + let new_shard_count = ShardCount::new(split_req.new_shard_count); + let new_stripe_size = split_req.new_stripe_size; + + // Validate the request and construct parameters. This phase is fallible, but does not require + // rollback on errors, as it does no I/O and mutates no state. + let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? { + ShardSplitAction::NoOp(resp) => return Ok(resp), + ShardSplitAction::Split(params) => params, + }; + + // Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails, + // we must roll back. + let r = self + .do_tenant_shard_split(tenant_id, shard_split_params) + .await; + + match r { + Ok(r) => Ok(r), + Err(e) => { + // Split might be part-done, we must do work to abort it. + tracing::warn!("Enqueuing background abort of split on {tenant_id}"); + self.abort_tx + .send(TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + _tenant_lock, + }) + // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. 
+ .ok(); + Err(e) + } + } + } + + fn prepare_tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, + ) -> Result { + fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( + anyhow::anyhow!("failpoint") + ))); + let mut policy = None; let mut shard_ident = None; - - // A parent shard which will be split - struct SplitTarget { - parent_id: TenantShardId, - node: Node, - child_ids: Vec, - } - // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -2195,7 +3056,7 @@ impl Service { // TODO: if any reconciliation is currently in progress for this shard, wait for it. - targets.push(SplitTarget { + targets.push(ShardSplitTarget { parent_id: *tenant_shard_id, node: node.clone(), child_ids: tenant_shard_id @@ -2205,9 +3066,9 @@ impl Service { if targets.is_empty() { if children_found.len() == split_req.new_shard_count as usize { - return Ok(TenantShardSplitResponse { + return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse { new_shards: children_found, - }); + })); } else { // No shards found to split, and no existing children found: the // tenant doesn't exist at all. @@ -2229,19 +3090,45 @@ impl Service { if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); } + shard_ident.stripe_size = new_stripe_size; + tracing::info!("applied stripe size {}", shard_ident.stripe_size.0); shard_ident } else { shard_ident.unwrap() }; let policy = policy.unwrap(); + Ok(ShardSplitAction::Split(ShardSplitParams { + old_shard_count, + new_shard_count: ShardCount::new(split_req.new_shard_count), + new_stripe_size: split_req.new_stripe_size, + targets, + policy, + shard_ident, + })) + } + + async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + params: ShardSplitParams, + ) -> Result { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the // same database transaction rather than pre-check in-memory and then maybe-fail the database write. // (https://github.com/neondatabase/neon/issues/6676) + let ShardSplitParams { + old_shard_count, + new_shard_count, + new_stripe_size, + targets, + policy, + shard_ident, + } = params; + // Before creating any new child shards in memory or on the pageservers, persist them: this // enables us to ensure that we will always be able to clean up if something goes wrong. This also // acts as the protection against two concurrent attempts to split: one of them will get a database @@ -2254,6 +3141,11 @@ impl Service { child_shard.number = child.shard_number; child_shard.count = child.shard_count; + tracing::info!( + "Create child shard persistence with stripe size {}", + shard_ident.stripe_size.0 + ); + this_child_tsps.push(TenantShardPersistence { tenant_id: child.tenant_id.to_string(), shard_number: child.shard_number.0 as i32, @@ -2292,6 +3184,9 @@ impl Service { _ => return Err(ApiError::InternalServerError(e.into())), } } + fail::fail_point!("shard-split-post-begin", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); // Now that I have persisted the splitting state, apply it in-memory. 
This is infallible, so // callers may assume that if splitting is set in memory, then it was persisted, and if splitting @@ -2301,20 +3196,21 @@ impl Service { for target in &targets { if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { parent_shard.splitting = SplitState::Splitting; + // Put the observed state to None, to reflect that it is indeterminate once we start the + // split operation. + parent_shard + .observed + .locations + .insert(target.node.get_id(), ObservedStateLocation { conf: None }); } } } - // FIXME: we have now committed the shard split state to the database, so any subsequent - // failure needs to roll it back. We will later wrap this function in logic to roll back - // the split if it fails. - // (https://github.com/neondatabase/neon/issues/6676) - // TODO: issue split calls concurrently (this only matters once we're splitting // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). for target in &targets { - let SplitTarget { + let ShardSplitTarget { parent_id, node, child_ids, @@ -2324,13 +3220,17 @@ impl Service { .tenant_shard_split( *parent_id, TenantShardSplitRequest { - new_shard_count: split_req.new_shard_count, - new_stripe_size: split_req.new_stripe_size, + new_shard_count: new_shard_count.literal(), + new_stripe_size, }, ) .await .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( + "failpoint".to_string() + ))); + tracing::info!( "Split {} into {}", parent_id, @@ -2365,62 +3265,13 @@ impl Service { .complete_shard_split(tenant_id, old_shard_count) .await?; + fail::fail_point!("shard-split-post-complete", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + // Replace all the shards we just split with their children: this phase is infallible. - let mut response = TenantShardSplitResponse { - new_shards: Vec::new(), - }; - let mut child_locations = Vec::new(); - { - let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); - for target in targets { - let SplitTarget { - parent_id, - node: _node, - child_ids, - } = target; - let (pageserver, generation, config) = { - let mut old_state = tenants - .remove(&parent_id) - .expect("It was present, we just split it"); - let old_attached = old_state.intent.get_attached().unwrap(); - old_state.intent.clear(scheduler); - let generation = old_state.generation.expect("Shard must have been attached"); - (old_attached, generation, old_state.config.clone()) - }; - - for child in child_ids { - let mut child_shard = shard_ident; - child_shard.number = child.shard_number; - child_shard.count = child.shard_count; - - let mut child_observed: HashMap = HashMap::new(); - child_observed.insert( - pageserver, - ObservedStateLocation { - conf: Some(attached_location_conf(generation, &child_shard, &config)), - }, - ); - - let mut child_state = TenantState::new(child, child_shard, policy.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); - child_state.observed = ObservedState { - locations: child_observed, - }; - child_state.generation = Some(generation); - child_state.config = config.clone(); - - // The child's TenantState::splitting is intentionally left at the default value of Idle, - // as at this point in the split process we have succeeded and this part is infallible: - // we will never need to do any special recovery from this state. 
- - child_locations.push((child, pageserver, child_shard.stripe_size)); - - tenants.insert(child, child_state); - response.new_shards.push(child); - } - } - } + let (response, child_locations) = + self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); @@ -2485,17 +3336,15 @@ impl Service { let old_attached = *shard.intent.get_attached(); match shard.policy { - PlacementPolicy::Single => { - shard.intent.clear_secondary(scheduler); - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); - } - PlacementPolicy::Double(_n) => { + PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. shard.intent.remove_secondary(scheduler, migrate_req.node_id); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { - shard.intent.push_secondary(scheduler, old_attached); + if n > 0 { + shard.intent.push_secondary(scheduler, old_attached); + } } shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); @@ -2709,6 +3558,8 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { + let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); @@ -2736,6 +3587,30 @@ impl Service { } } + // We do not require that a node is actually online when registered (it will start life + // with its availability set to Offline), but we _do_ require that its DNS record exists. We're + // therefore not immune to asymmetric L3 connectivity issues, but we are protected against nodes + // that register themselves with a broken DNS config. We check only the HTTP hostname, because + // the postgres hostname might only be resolvable to clients (e.g. if we're on a different VPC than clients). + if tokio::net::lookup_host(format!( + "{}:{}", + register_req.listen_http_addr, register_req.listen_http_port + )) + .await + .is_err() + { + // If we have a transient DNS issue, it's up to the caller to retry their registration. Because + // we can't robustly distinguish between an intermittent issue and a totally bogus DNS situation, + // we return a soft 503 error, to encourage callers to retry past transient issues. + return Err(ApiError::ResourceUnavailable( + format!( + "Node {} tried to register with unknown DNS name '{}'", + register_req.node_id, register_req.listen_http_addr + ) + .into(), + )); + } + + // Ordering: we must persist the new node _before_ adding it to in-memory state. + // This ensures that before we use it for anything or expose it via any external + // API, it is guaranteed to be available after a restart.
@@ -2768,34 +3643,65 @@ impl Service { pub(crate) async fn node_configure( &self, - config_req: NodeConfigureRequest, + node_id: NodeId, + availability: Option, + scheduling: Option, ) -> Result<(), ApiError> { - if let Some(scheduling) = config_req.scheduling { + let _node_lock = self.node_op_locks.exclusive(node_id).await; + + if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence - .update_node(config_req.node_id, scheduling) - .await?; + self.persistence.update_node(node_id, scheduling).await?; } + // If we're activating a node, then before setting it active we must reconcile any shard locations + // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup, + // by calling [`Self::node_activate_reconcile`] + // + // The transition we calculate here remains valid later in the function because we hold the op lock on the node: + // nothing else can mutate its availability while we run. + let availability_transition = if let Some(input_availability) = availability { + let (activate_node, availability_transition) = { + let locked = self.inner.read().unwrap(); + let Some(node) = locked.nodes.get(&node_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + )); + }; + + ( + node.clone(), + node.get_availability_transition(input_availability), + ) + }; + + if matches!(availability_transition, AvailabilityTransition::ToActive) { + self.node_activate_reconcile(activate_node, &_node_lock) + .await?; + } + availability_transition + } else { + AvailabilityTransition::Unchanged + }; + + // Apply changes from the request to our in-memory state for the Node let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); let mut new_nodes = (**nodes).clone(); - let Some(node) = new_nodes.get_mut(&config_req.node_id) else { + let Some(node) = new_nodes.get_mut(&node_id) else { return Err(ApiError::NotFound( anyhow::anyhow!("Node not registered").into(), )); }; - let availability_transition = if let Some(availability) = &config_req.availability { - node.set_availability(*availability) - } else { - AvailabilityTransition::Unchanged - }; + if let Some(availability) = &availability { + node.set_availability(*availability); + } - if let Some(scheduling) = config_req.scheduling { + if let Some(scheduling) = scheduling { node.set_scheduling(scheduling); // TODO: once we have a background scheduling ticker for fill/drain, kick it @@ -2807,27 +3713,33 @@ impl Service { let new_nodes = Arc::new(new_nodes); + // Modify scheduling state for any Tenants that are affected by a change in the node's availability state. 
match availability_transition { AvailabilityTransition::ToOffline => { - tracing::info!("Node {} transition to offline", config_req.node_id); + tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { + if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; } - if tenant_state.intent.demote_attached(config_req.node_id) { + if new_nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } + + if tenant_state.intent.demote_attached(node_id) { tenant_state.sequence = tenant_state.sequence.next(); match tenant_state.schedule(scheduler) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. // TODO: give TenantState a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", config_req.node_id); + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self @@ -2843,17 +3755,15 @@ impl Service { tracing::info!( "Launched {} reconciler tasks for tenants affected by node {} going offline", tenants_affected, - config_req.node_id + node_id ) } AvailabilityTransition::ToActive => { - tracing::info!("Node {} transition to active", config_req.node_id); + tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = - tenant_state.observed.locations.get_mut(&config_req.node_id) - { + if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { self.maybe_reconcile_shard(tenant_state, &new_nodes); } @@ -2863,7 +3773,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", config_req.node_id); + tracing::info!("Node {} no change during config", node_id); } } @@ -2942,17 +3852,23 @@ impl Service { ) } - /// Check all tenants for pending reconciliation work, and reconcile those in need + /// Check all tenants for pending reconciliation work, and reconcile those in need. + /// Additionally, reschedule tenants that require it. 
/// /// Returns how many reconciliation tasks were started fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let pageservers = locked.nodes.clone(); - locked - .tenants - .iter_mut() - .filter_map(|(_tenant_shard_id, shard)| self.maybe_reconcile_shard(shard, &pageservers)) - .count() + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let pageservers = nodes.clone(); + + let mut reconciles_spawned = 0; + for (_tenant_shard_id, shard) in tenants.iter_mut() { + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + reconciles_spawned += 1; + } + } + + reconciles_spawned } pub async fn shutdown(&self) { diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 3c91e09ac3..9dd368bf41 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -457,22 +457,7 @@ impl TenantState { // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { - Single => { - // Should have exactly one attached, and zero secondaries - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - - let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?; - modified |= modified_attached; - - if !self.intent.secondary.is_empty() { - self.intent.clear_secondary(scheduler); - modified = true; - } - } - Double(secondary_count) => { + Attached(secondary_count) => { let retain_secondaries = if self.intent.attached.is_none() && scheduler.node_preferred(&self.intent.secondary).is_some() { @@ -577,7 +562,12 @@ impl TenantState { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -890,7 +880,7 @@ pub(crate) mod tests { let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state .schedule(&mut scheduler) .expect("we have enough nodes, scheduling should work"); @@ -938,7 +928,7 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1)); + let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); tenant_state.observed.locations.insert( NodeId(3), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6c722f36b4..401feae706 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -437,7 +437,7 @@ async fn handle_tenant( let placement_policy = match create_match.get_one::("placement-policy") { Some(s) if !s.is_empty() => serde_json::from_str::(s)?, - _ => PlacementPolicy::Single, + _ => PlacementPolicy::Attached(0), }; let tenant_conf = PageServerNode::parse_config(tenant_conf)?; @@ -523,88 +523,6 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - Some(("migrate", matches)) => { - let 
tenant_shard_id = get_tenant_shard_id(matches, env)?; - let new_pageserver = get_pageserver(env, matches)?; - let new_pageserver_id = new_pageserver.conf.id; - - let storage_controller = StorageController::from_env(env); - storage_controller - .tenant_migrate(tenant_shard_id, new_pageserver_id) - .await?; - - println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id); - } - Some(("status", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - - let mut shard_table = comfy_table::Table::new(); - shard_table.set_header(["Shard", "Pageserver", "Physical Size"]); - - let mut tenant_synthetic_size = None; - - let storage_controller = StorageController::from_env(env); - for shard in storage_controller.tenant_locate(tenant_id).await?.shards { - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?); - - let size = pageserver - .http_client - .tenant_details(shard.shard_id) - .await? - .tenant_info - .current_physical_size - .unwrap(); - - shard_table.add_row([ - format!("{}", shard.shard_id.shard_slug()), - format!("{}", shard.node_id.0), - format!("{} MiB", size / (1024 * 1024)), - ]); - - if shard.shard_id.is_zero() { - tenant_synthetic_size = - Some(pageserver.tenant_synthetic_size(shard.shard_id).await?); - } - } - - let Some(synthetic_size) = tenant_synthetic_size else { - bail!("Shard 0 not found") - }; - - let mut tenant_table = comfy_table::Table::new(); - tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]); - tenant_table.add_row([ - "Synthetic size".to_string(), - format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)), - ]); - - println!("{tenant_table}"); - println!("{shard_table}"); - } - Some(("shard-split", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - let shard_count: u8 = matches.get_one::("shard-count").cloned().unwrap_or(0); - let shard_stripe_size: Option = matches - .get_one::>("shard-stripe-size") - .cloned() - .unwrap(); - - let storage_controller = StorageController::from_env(env); - let result = storage_controller - .tenant_split(tenant_id, shard_count, shard_stripe_size) - .await?; - println!( - "Split tenant {} into shards {}", - tenant_id, - result - .new_shards - .iter() - .map(|s| format!("{:?}", s)) - .collect::>() - .join(",") - ); - } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -1578,19 +1496,6 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("migrate") - .about("Migrate a tenant from one pageserver to another") - .arg(tenant_id_arg.clone()) - .arg(pageserver_id_arg.clone())) - .subcommand(Command::new("status") - .about("Human readable summary of the tenant's shards and attachment locations") - .arg(tenant_id_arg.clone())) - .subcommand(Command::new("shard-split") - .about("Increase the number of shards in the tenant") - .arg(tenant_id_arg.clone()) - .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) - .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) - ) ) .subcommand( Command::new("pageserver") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2e64489432..bd3dbef453 100644 
--- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -114,7 +114,7 @@ impl NeonBroker { } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct PageServerConf { // node id pub id: NodeId, @@ -126,6 +126,9 @@ pub struct PageServerConf { // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + + pub(crate) virtual_file_io_engine: Option, + pub(crate) get_vectored_impl: Option, } impl Default for PageServerConf { @@ -136,6 +139,8 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + virtual_file_io_engine: None, + get_vectored_impl: None, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 06ec942895..c5eabc46db 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -78,18 +78,39 @@ impl PageServerNode { /// /// These all end up on the command line of the `pageserver` binary. fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { - let id = format!("id={}", self.conf.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr); + let PageServerConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + virtual_file_io_engine, + get_vectored_impl, + } = &self.conf; - let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr); + let id = format!("id={}", id); + + let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type); + let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr); + + let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type); + let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr); + let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine { + format!("virtual_file_io_engine='{virtual_file_io_engine}'") + } else { + String::new() + }; + let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl { + format!("get_vectored_impl='{get_vectored_impl}'") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -101,6 +122,8 @@ impl PageServerNode { listen_http_addr_param, listen_pg_addr_param, broker_endpoint_param, + virtual_file_io_engine, + get_vectored_impl, ]; if let Some(control_plane_api) = &self.env.control_plane_api { @@ -111,7 +134,7 @@ impl PageServerNode { // Storage controller uses the same auth as pageserver: if JWT is enabled // for us, we will also need it to talk to them. 
- if matches!(self.conf.http_auth_type, AuthType::NeonJWT) { + if matches!(http_auth_type, AuthType::NeonJWT) { let jwt_token = self .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) @@ -129,8 +152,7 @@ impl PageServerNode { )); } - if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust - { + if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); @@ -554,13 +576,6 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_shard_id).await?) } - pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { - Ok(self - .http_client - .tenant_secondary_download(*tenant_id) - .await?) - } - pub async fn timeline_create( &self, tenant_shard_id: TenantShardId, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index d7673f1b26..e7697ecac8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -38,6 +38,9 @@ const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +// Use a shorter pageserver unavailability interval than the default to speed up tests. +const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -269,6 +272,8 @@ impl StorageController { // Run migrations on every startup, in case something changed. let database_url = self.setup_database().await?; + let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); + let mut args = vec![ "-l", &self.listen, @@ -276,6 +281,8 @@ impl StorageController { self.path.as_ref(), "--database-url", &database_url, + "--max-unavailable-interval", + &max_unavailable.to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -468,7 +475,7 @@ impl StorageController { pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( Method::GET, - format!("control/v1/tenant/{tenant_id}/locate"), + format!("debug/v1/tenant/{tenant_id}/locate"), None, ) .await diff --git a/docs/rfcs/031-sharding-static.md b/docs/rfcs/031-sharding-static.md new file mode 100644 index 0000000000..fe009b8660 --- /dev/null +++ b/docs/rfcs/031-sharding-static.md @@ -0,0 +1,408 @@ +# Sharding Phase 1: Static Key-space Sharding + +## Summary + +To enable databases with sizes approaching the capacity of a pageserver's disk, +it is necessary to break up the storage for the database, or _shard_ it. + +Sharding in general is a complex area. This RFC aims to define an initial +capability that will permit creating large-capacity databases using a static configuration +defined at time of Tenant creation. + +## Motivation + +Currently, all data for a Tenant, including all its timelines, is stored on a single +pageserver. The local storage required may be several times larger than the actual +database size, due to LSM write inflation. + +If a database is larger than what one pageserver can hold, then it becomes impossible +for the pageserver to hold it in local storage, as it must do to provide service to +clients. 
+
+### Prior art
+
+In Neon:
+
+- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
+- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
+- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
+
+Prior art in other distributed systems is too broad to capture here: pretty much
+any scale-out storage system does something like this.
+
+## Requirements
+
+- Enable creating a large (for example, 16TiB) database without requiring dedicated
+  pageserver nodes.
+- Share read/write bandwidth costs for large databases across pageservers, as well
+  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
+  that disrupt service to other tenants.
+- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
+  does not write out a single contiguous range of page numbers.
+
+_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
+that a user might create on a current-gen enterprise SSD should also work well on
+Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
+pageserver backend is not the limiting factor in the database size_.
+
+## Non Goals
+
+- Independently distributing timelines within the same tenant. If a tenant has many
+  timelines, then sharding may be a less efficient mechanism for distributing load than
+  sharing out timelines between pageservers.
+- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
+  based on the idea that separate mechanisms will make sense for each dimension.
+
+## Impacted Components
+
+pageserver, control plane, postgres/smgr
+
+## Terminology
+
+**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
+the page number is the key in that store. `Key` is a literal data type in existing code.
+
+**LSN dimension**: this just means the range of LSNs (history), when talking about the range
+of keys and LSNs as a two dimensional space.
+
+## Implementation
+
+### Key sharding vs. LSN sharding
+
+When we think of sharding across the two dimensional key/lsn space, this is an
+opportunity to think about how the two dimensions differ:
+
+- Sharding the key space distributes the _write_ workload of ingesting data
+  and compacting. This work must be carefully managed so that exactly one
+  node owns a given key.
+- Sharding the LSN space distributes the _historical read_ workload. This work
+  can be done by anyone without any special coordination, as long as they can
+  see the remote index and layers.
+
+The key sharding is the harder part, and also the more urgent one, to support larger
+capacity databases. Because distributing historical LSN read work is a relatively
+simpler problem that most users don't have, we defer it to future work. It is anticipated
+that some quite simple P2P offload model will enable distributing work for historical
+reads: a node which is low on space can call out to a peer to ask it to download and
+serve reads from a historical layer.
+
+### Key mapping scheme
+
+Having decided to focus on key sharding, we must next decide how we will map
+keys to shards.
It is proposed to use a "wide striping" approach, to obtain a good compromise
+between data locality and avoiding entire large relations mapping to the same shard.
+
+We will define two spaces:
+
+- Key space: unsigned integer
+- Shard space: integer from 0 to N-1, where we have N shards.
+
+### Key -> Shard mapping
+
+Keys are currently defined in the pageserver's getpage@lsn interface as follows:
+
+```
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+```
+
+_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
+shards. For distribution purposes, we only care about user data keys._
+
+The properties we want from our Key->Shard mapping are:
+
+- Locality in `blknum`, such that adjacent `blknum` will usually map to
+  the same stripe and consequently land on the same shard, even though the overall
+  collection of blocks in a relation will be spread over many stripes and therefore
+  many shards.
+- Avoid the same blknum on different relations landing on the same stripe, so that
+  with many small relations we do not end up aliasing data to the same stripe/shard.
+- Avoid vulnerability to aliasing in the values of relation identity fields, such that
+  if there are patterns in the value of `relnode`, these do not manifest as patterns
+  in data placement.
+
+To accomplish this, the blknum is used to select a stripe, and stripes are
+assigned to shards in a pseudorandom order via a hash. The motivation for
+pseudo-random distribution (rather than sequential mapping of stripe to shard)
+is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
+all relations' stripes to touch pageservers in the same order.
+
+To map a `Key` to a shard (a code sketch of this mapping follows the placement
+examples below):
+
+- Hash the `Key` field 4 (relNode).
+- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
+  hash of this with the hash from the previous step.
+- The total hash modulo the shard count gives the shard holding this key.
+
+Why don't we use the other fields in the Key?
+
+- We ignore `forknum` for key mapping, because it distinguishes different classes of data
+  in the same relation, and we would like to keep the data in a relation together.
+- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
+  database's blocks differ only by spcNode and dbNode from the original. To enable running
+  this type of creation without cross-pageserver communication, we must ensure that these
+  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
+
+### Data placement examples
+
+For example, consider the extreme cases of postgres data layout in a large database, in a system with 8 shards
+and a stripe size of 32k pages:
+
+- A single large relation: `blknum` division will break the data up into 4096
+  stripes, which will be scattered across the shards.
+- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
+  and that stripe will be placed according to the hash of key field 4. The
+  data placement will be statistically uniform across shards.
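+
+As a concrete illustration of the mapping rules above, here is a minimal sketch
+(using the `Key` struct above; the hash function and helper names are placeholders,
+not the final implementation -- a production scheme must use a stable, versioned hash):
+
+```
+use std::hash::{Hash, Hasher};
+
+fn key_to_shard(key: &Key, shard_count: u32, stripe_size: u32) -> u32 {
+    // Placeholder hash: DefaultHasher is not stable across Rust releases.
+    let mut hasher = std::collections::hash_map::DefaultHasher::new();
+    key.field4.hash(&mut hasher);                 // relnode
+    (key.field6 / stripe_size).hash(&mut hasher); // stripe number derived from blknum
+    // forknum, spcnode and dbnode are deliberately excluded (see above).
+    (hasher.finish() % shard_count as u64) as u32
+}
+```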
+
+Data placement will be more uneven on smaller databases:
+
+- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
+  that both relations land on the same shard and no data lands on the other shard.
+- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
+  the data of the other four shards.
+
+These uneven cases for small amounts of data do not matter, as long as the stripe size
+is an order of magnitude smaller than the amount of data we are comfortable holding
+in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
+a tenant has some shards with 256MB size and some shards with 512MB size, even though
+the standard deviation of shard size within the tenant is very high. Our key mapping
+scheme provides a statistical guarantee that as the tenant's overall data size increases,
+uniformity of placement will improve.
+
+### Important Types
+
+#### `ShardIdentity`
+
+Provides the information needed to know whether a particular key belongs
+to a particular shard:
+
+- Layout version
+- Stripe size
+- Shard count
+- Shard index
+
+This structure's size is constant. Note that if we had used a different key
+mapping scheme such as consistent hashing with explicit hash ranges assigned
+to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
+key mapping scheme used here enables a small fixed-size ShardIdentity.
+
+### Pageserver changes
+
+#### Structural
+
+Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
+`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
+of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
+covers the whole keyspace.
+
+When the pageserver writes layers and index_part.json to remote storage, it must
+include the shard index & count in the name, to avoid collisions (the count is
+necessary for future-proofing: the count will vary in time). These keys
+will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
+exactly the same for TenantShards as it does for Tenants today: each shard will have
+its own generation number.
+
+#### Storage Format: Keys
+
+For tenants with >1 shard, layer files implicitly become sparse: within the key
+range described in the layer name, the layer file for a shard will only hold the
+content relevant to stripes assigned to the shard.
+
+For this reason, the LayerFileName within a tenant is no longer unique: different shards
+may use the same LayerFileName to refer to different data. We may solve this simply
+by including the shard number in the keys used for layers.
+
+The shard number will be included as a prefix (as part of tenant ID), like this:
+
+`pageserver/v1/tenants/<tenant_id>-<shard>/timelines/<timeline_id>/<layer_file_name>-<generation>`
+
+`pageserver/v1/tenants/<tenant_id>-<shard>/timelines/<timeline_id>/index_part.json-<generation>`
+
+Reasons for this particular format:
+
+- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
+  we construct a layer file name), and enables efficient listing of index_parts within
+  a particular shard-timeline prefix.
+- Including the shard _count_ as well as shard number means that in future when we implement
+  shard splitting, it will be possible for a parent shard and one of its children to write
+  the same layer file without a name collision.
For example, a parent shard 0_1 might split
+  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
+  that is distinct from what shard 0_1 would have written at the same place.
+
+In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
+and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
+for example a single-shard tenant's prefix will be `0001`.
+
+For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
+and use this as a cue to construct paths with no prefix at all.
+
+#### Storage Format: Indices
+
+In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
+when we implement shard splitting in future, it will be useful to enable shards to reference layers
+written by other shards (specifically the parent shard during a split), so that shards don't
+have to exhaustively copy all data into their own shard-prefixed keys.
+
+To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
+tuple on each layer, such that it can construct paths for layers written by other shards. This
+naturally raises the question of who "owns" such layers written by ancestral shards: this problem
+will be addressed in phase 2.
+
+For backward compatibility, any index entry without shard information will be assumed to be
+in the legacy `ShardIdentity`.
+
+#### WAL Ingest
+
+In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
+it down to the pages relevant to their shard:
+
+- For ordinary user data writes, only retain a write if it matches the ShardIdentity.
+- For metadata describing relations etc, all shards retain these writes.
+
+The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
+one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
+and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
+expensive: if the safekeeper can be made shard-aware then it could be taught to use
+the min() of all shards' remote_consistent_lsns to decide when to trim the WAL, since
+WAL up to an LSN may only be trimmed once every shard has persisted it.
+
+#### Compaction/GC
+
+No changes needed.
+
+The pageserver doesn't have to do anything special during compaction
+or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
+This will result in sparse layer files, containing keys only in the stripes that this
+shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
+the key range, these should be updated to ignore gaps that are due to sharding, to
+avoid spuriously splitting up layers into stripe-sized pieces.
+
+### Compute Endpoints
+
+Compute endpoints will need to:
+
+- Accept a vector of connection strings as part of their configuration from the control plane
+- Route pageserver requests by mapping each key to a shard, and picking the corresponding
+  entry in the vector of connection strings (a short routing sketch follows below).
+
+Doing this in compute rather than routing requests via a single pageserver is
+necessary to enable sharding tenants without adding latency from extra hops.
+
+### Control Plane
+
+Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
+be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
+tenants.
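+
+To make the compute-side routing above concrete, a minimal sketch (hypothetical
+names; the real change lives in the smgr layer of the compute, and `key_to_shard`
+is the placeholder mapping sketched earlier in this RFC):
+
+```
+struct ShardMap {
+    stripe_size: u32,
+    // One connection string per shard, indexed by shard number.
+    connstrings: Vec<String>,
+}
+
+impl ShardMap {
+    fn connstring_for(&self, key: &Key) -> &str {
+        let shard = key_to_shard(key, self.connstrings.len() as u32, self.stripe_size);
+        &self.connstrings[shard as usize]
+    }
+}
+```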
+
+Tenant lifecycle operations like deletion will require fanning out to all the shards
+in the tenant. The same goes for timeline creation and deletion: a timeline should
+not be considered created until it has been created in all shards.
+
+#### Selectively enabling sharding for large tenants
+
+Initially, we will explicitly enable sharding for large tenants only.
+
+In future, this hint mechanism will become optional when we implement automatic
+re-sharding of tenants.
+
+## Future Phases
+
+This section exists to indicate what will likely come next after this phase.
+
+Phases 2a and 2b are amenable to execution in parallel.
+
+### Phase 2a: WAL fan-out
+
+**Problem**: when all shards consume the whole WAL, the network bandwidth used
+for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
+of the shard count.
+
+Network bandwidth is not our most pressing bottleneck, but it is likely to become
+a problem if we set a modest shard count (~8) on a significant number of tenants,
+especially as those larger tenants which we shard are also likely to have higher
+write bandwidth than average.
+
+### Phase 2b: Shard Splitting
+
+**Problem**: the number of shards in a tenant is defined at creation time and cannot
+be changed. This causes excessive sharding for most small tenants, and an upper
+bound on scale for very large tenants.
+
+To address this, a _splitting_ feature will later be added. One shard can split its
+data into a number of children by doing a special compaction operation to generate
+image layers broken up child-shard-wise, and then writing out an `index_part.json` for
+each child. This will then require external coordination (by the control plane) to
+safely attach these new child shards and then move them around to distribute work.
+The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
+once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
+the risk/complexity of implementing such a rarely-encountered scenario.
+
+### Phase N (future): distributed historical reads
+
+**Problem**: while sharding based on key is good for handling changes in overall
+database size, it is less suitable for spiky/unpredictable changes in the read
+workload to historical layers. Sudden increases in historical reads could result
+in sudden increases in local disk capacity required for a TenantShard.
+
+Example: the extreme case of this would be to run a tenant for a year, then create branches
+with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
+the on-disk capacity footprint of a TenantShard, since it would be serving reads
+from all those disparate historical layers.
+
+If we can respond fast enough, then key-sharding a tenant more finely can help with
+this, but splitting may be a relatively expensive operation and the increased historical
+read load may be transient.
+
+A separate mechanism for handling heavy historical reads could be something like
+a gossip mechanism for pageservers to communicate
+about their workload, and then a getpageatlsn offload mechanism where one pageserver can
+ask another to go read the necessary layers from remote storage to serve the read. This
+requires relatively little coordination because it is read-only: any node can service any
+read. All reads to a particular shard would still flow through one node, but the
+disk capacity & I/O impact of servicing the read would be distributed.
+
+## FAQ/Alternatives
+
+### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
+
+When a database is growing under a write workload, writes may predominantly hit the
+end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
+is intensively re-writing a particular relation and that relation lived in a particular
+shard, then it would not achieve our goal of distributing the write work across shards.
+
+### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
+
+1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
+   database would still cause a load hotspot on the pageserver routing its read requests.
+2. The additional hop through the "proxy" pageserver would add latency and overall
+   resource cost (CPU, network bandwidth).
+
+### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
+
+In this model, there would be no explicit sharding of work, but the pageserver to which
+a tenant is attached would not hold all layers on its disk: instead, it would call out
+to peers to have them store some layers, and call out to those peers to request reads
+in those layers.
+
+This mechanism will work well for distributing work in the LSN dimension, but in the key
+space dimension it has the major limitation of requiring one node to handle all
+incoming writes, and compactions. Even if the write workload for a large database
+fits in one pageserver, it will still be a hotspot and such tenants may still
+de-facto require their own pageserver.
diff --git a/docs/rfcs/032-shard-splitting.md b/docs/rfcs/032-shard-splitting.md
new file mode 100644
index 0000000000..d5fbda8415
--- /dev/null
+++ b/docs/rfcs/032-shard-splitting.md
@@ -0,0 +1,479 @@
+# Shard splitting
+
+## Summary
+
+This RFC describes a new pageserver API for splitting an existing tenant shard into
+multiple shards, and describes how to use this API to safely increase the total
+shard count of a tenant.
+
+## Motivation
+
+In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
+tenants beyond the capacity of a single pageserver by breaking up the key space
+into stripes, and distributing these stripes across many pageservers. However,
+the shard count was defined once at tenant creation time and not varied thereafter.
+
+In practice, the expected size of a database is rarely known at creation time, and
+it is inefficient to enable sharding for very small tenants: we need to be
+able to create a tenant with a small number of shards (such as 1), and later expand
+when it becomes clear that the tenant has grown in size to a point where sharding
+is beneficial.
+
+### Prior art
+
+Many distributed systems have the problem of choosing how many shards to create for
+tenants that do not specify an expected size up-front. There are a couple of general
+approaches:
+
+- Write to a key space in order, and start a new shard when the highest key advances
+  past some point. This doesn't work well for Neon, because we write to our key space
+  in many different contiguous ranges (per relation), rather than in one contiguous
+  range. To adapt to this kind of model, we would need a sharding scheme where each
+  relation had its own range of shards, which would be inefficient for the common
+  case of databases with many small relations.
+- Monitor the system, and automatically re-shard at some size threshold.
For
+  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
+  component monitors the size of each RADOS Pool, and adjusts the number of Placement
+  Groups (Ceph's shard equivalent).
+
+## Requirements
+
+- A configurable capacity limit per shard is enforced.
+- Changes in shard count do not interrupt service beyond requiring postgres
+  to reconnect (i.e. milliseconds).
+- A human being does not have to choose the shard count.
+
+## Non Goals
+
+- Shard splitting is always a tenant-global operation: we will not enable splitting
+  one shard while leaving others intact.
+- The inverse operation (shard merging) is not described in this RFC. This is a lower
+  priority than splitting, because databases grow more often than they shrink, and
+  a database with many shards will still work properly if the stored data shrinks, just
+  with slightly more overhead (e.g. redundant WAL replication).
+- Shard splitting is only initiated based on capacity bounds, not load. Splitting
+  a tenant based on load will make sense for some medium-capacity, high-load workloads,
+  but is more complex to reason about and likely is not desirable until we have
+  shard merging to reduce the shard count again if the database becomes less busy.
+
+## Impacted Components
+
+pageserver, storage controller
+
+(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
+
+## Terminology
+
+**Parent** shards are the shards that exist before a split. **Child** shards are
+the new shards created during a split.
+
+**Shard** is synonymous with _tenant shard_.
+
+**Shard Index** is the 2-tuple of shard number and shard count, written in
+paths as {:02x}{:02x}, e.g. `0001`.
+
+## Background
+
+In the implementation section, a couple of existing aspects of sharding are important
+to remember:
+
+- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
+  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
+  storage paths, and remote index metadata.
+- Remote layer file paths contain the shard index of the shard that created them, and
+  remote indices contain the same index to enable building the layer file path. A shard's
+  index may reference layers that were created by another shard.
+- Local tenant shard directories include the shard index. All layers downloaded by
+  a tenant shard are stored in this shard-prefixed path, even if those layers were
+  initially created by another shard: tenant shards do not read and write one another's
+  paths.
+- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
+  This is for historical reasons and will be cleaned up in future, but the existing
+  name is used here to help comprehension when reading code.
+
+## Implementation
+
+Note: this section focuses on the correctness of the core split process. This will
+be fairly inefficient in a naive implementation, and several important optimizations
+are described in a later section.
+
+There are broadly two parts to the implementation:
+
+1. The pageserver split API, which splits one shard on one pageserver
+2. The overall tenant split process which is coordinated by the storage controller,
+   and calls into the pageserver split API as needed.
+
+### Pageserver Split API
+
+The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
+that takes the new total shard count in the body.
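+
+For illustration, a client-side call might look like the sketch below. The HTTP
+method and the body field name are assumptions for illustration, not a confirmed
+contract:
+
+```
+// Hypothetical reqwest-based client sketch; `pageserver_api` and
+// `tenant_shard_id` are assumed to be in scope.
+let client = reqwest::Client::new();
+let response = client
+    .put(format!("{pageserver_api}/v1/tenant/{tenant_shard_id}/shard_split"))
+    .json(&serde_json::json!({ "new_shard_count": 4 }))
+    .send()
+    .await?;
+assert!(response.status().is_success());
+```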
+
+The pageserver split API operates on one tenant shard, on one pageserver. External
+coordination is required to use it safely; this is described in the later
+'Split procedure' section.
+
+#### Preparation
+
+First identify the shard indices for the new child shards. These are deterministic,
+calculated from the parent shard's index, and the number of children being created (this
+is an input to the API, and validated to be a power of two). In a trivial example, splitting
+0001 in two always results in 0002 and 0102. (A sketch of this calculation appears at the
+end of this section.)
+
+Child shard indices are chosen such that the children's parts of the keyspace will
+be subsets of the parent's parts of the keyspace.
+
+#### Step 1: write new remote indices
+
+In remote storage, splitting is very simple: we may just write new index_part.json
+objects for each child shard, containing exactly the same layers as the parent shard.
+
+The children will have more data than they need, but this avoids any exhaustive
+re-writing or copying of layer files.
+
+The index key path includes a generation number: the parent shard's current
+attached generation number will also be used for the child shards' indices. This
+makes the operation safely retryable: if everything crashes and restarts, we may
+call the split API again on the parent shard, and the result will be some new remote
+indices for the child shards, under a higher generation number.
+
+#### Step 2: start new `Tenant` objects
+
+A new `Tenant` object may be instantiated for each child shard, while the parent
+shard still exists. When calling the tenant_spawn function for this object,
+the remote index from step 1 will be read, and the child shard will start
+to ingest WAL to catch up from whatever was in the remote storage at step 1.
+
+We now wait for child shards' WAL ingestion to catch up with the parent shard,
+so that we can safely tear down the parent shard without risking an availability
+gap to clients reading recent LSNs.
+
+#### Step 3: tear down parent `Tenant` object
+
+Once child shards are running and have caught up with WAL ingest, we no longer
+need the parent shard. Note that clients may still be using it -- when we
+shut it down, any page_service handlers will also shut down, causing clients
+to disconnect. When the client reconnects, it will look up the tenant again,
+and hit the child shard instead of the parent (shard lookup from page_service
+should bias toward higher ShardCount shards).
+
+Note that at this stage the page service client has not yet been notified of
+any split. In the trivial single split example:
+
+- Shard 0001 is gone: Tenant object torn down
+- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
+- Clients will continue to connect to that server thinking that shard 0001 is there,
+  and all requests will work, because any key that was in shard 0001 is definitely
+  available in either shard 0002 or shard 0102.
+- Eventually, the storage controller (not the pageserver) will decide to migrate
+  some child shards away: at that point it will do a live migration, ensuring
+  that the client has an updated configuration before it detaches anything
+  from the original server.
+
+#### Complete
+
+When we send a 200 response to the split request, we are promising the caller:
+
+- That the child shards are persistent in remote storage
+- That the parent shard has been shut down
+
+This enables the caller to proceed with the overall shard split operation, which
+may involve other shards on other pageservers.
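+
+A sketch of the deterministic child-index calculation used in the Preparation step
+(names illustrative). Each child keeps the parent's number modulo the old count,
+which is what makes its keyspace a subset of the parent's under the
+hash-modulo-shard-count mapping:
+
+```
+fn child_shard_numbers(parent_number: u8, old_count: u8, new_count: u8) -> Vec<u8> {
+    // The API validates the split factor; assert the expected invariants here.
+    assert!(new_count % old_count == 0);
+    assert!((new_count / old_count).is_power_of_two());
+    (0..new_count)
+        .filter(|child| child % old_count == parent_number)
+        .collect()
+}
+
+// Splitting 0001 (number 0 of count 1) into two yields numbers [0, 1],
+// i.e. shards 0002 and 0102, matching the example above.
+assert_eq!(child_shard_numbers(0, 1, 2), vec![0, 1]);
+```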
+
+### Storage Controller Split procedure
+
+Splitting a tenant requires calling the pageserver split API, and tracking
+enough state to ensure recovery + completion in the event of any component (pageserver
+or storage controller) crashing (or request timing out) during the split.
+
+1. Call the split API on all existing shards. Ensure that the resulting
+   child shards are pinned to their pageservers until _all_ the split calls are done.
+   This pinning may be implemented as a "split bit" on the tenant shards that
+   blocks any migrations, and also acts as a sign that if we restart, we must go
+   through some recovery steps to resume the split.
+2. Once all the split calls are done, we may unpin the child shards (clear
+   the split bit). The split is now complete: subsequent steps are just migrations,
+   not strictly part of the split.
+3. Try to schedule new pageserver locations for the child shards, using
+   a soft anti-affinity constraint to place shards from the same tenant onto different
+   pageservers.
+
+Updating computes about the new shard count is not necessary until we migrate
+any of the child shards away from the parent's location.
+
+### Recovering from failures
+
+#### Rolling back an incomplete split
+
+An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
+and detaching child shards. This will lose any WAL ingested into the children after the parents
+were detached earlier, but the parents will catch up.
+
+No special pageserver API is needed for this. From the storage controller's point of view, the
+procedure is:
+
+1. For all parent shards in the tenant, ensure they are attached.
+2. For all child shards, ensure they are not attached.
+3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
+
+Any remote storage content for child shards is left behind. This is similar to other cases where
+we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
+index that references it). Future online scrub/cleanup functionality can remove these objects, or
+they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
+which would include any child shards that were rolled back.
+
+If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
+this, we will **block timeline creation during splitting**, so that we can safely roll back until
+the split is complete, without risking losing timelines.
+
+Rolling back an incomplete split will happen automatically if a split fails due to some fatal
+reason, and will not be accessible via an API:
+
+- A pageserver fails to complete its split API request after too many retries
+- A pageserver returns a fatal unexpected error such as 400 or 500
+- The storage controller database returns a non-retryable error
+- Some internal invariant is violated in the storage controller split code
+
+#### Rolling back a complete split
+
+A complete shard split may be rolled back similarly to an incomplete split, with the following
+modifications:
+
+- The parent shards will no longer exist in the storage controller database, so these must
+  be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
+  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
+  shards in the storage controller database.
+- Any timelines that were created after the split completed will disappear when rolling back
+  to the parent shards. For this reason, rolling back after a complete split should only
+  be done due to serious issues where loss of recently created timelines is acceptable, or
+  in cases where we have confirmed that no timelines were created in the intervening period.
+- Parent shards' layers must not have been deleted: this property will come "for free" when
+  we first roll out sharding, by simply not implementing deletion of parent layers after
+  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
+  Optimizations section), it should apply a TTL to layers such that we have a
+  defined walltime window in which rollback will be possible.
+
+The storage controller will expose an API for rolling back a complete split, for use
+in the field if we encounter some critical bug with a post-split tenant.
+
+#### Retrying API calls during Pageserver Restart
+
+When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
+child shards from an ongoing split. This does not intrinsically break anything, and the
+pageserver may include all these shards in its `/re-attach` request to the storage controller.
+
+In order to support such restarts, it is important that the storage controller stores
+persistent records of each child shard before it calls into a pageserver, as these child shards
+may require generation increments via a `/re-attach` request.
+
+The pageserver restart will also result in a failed API call from the storage controller's point
+of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
+complete, and all shards must remain pinned to their current pageserver locations until the
+split is done.
+
+The pageserver API calls during splitting will retry on transient errors, so that
+short availability gaps do not result in a failure of the overall operation. The
+split in progress will be automatically rolled back if the threshold for API
+retries is reached (e.g. if a pageserver stays offline for longer than a typical
+restart).
+
+#### Rollback on Storage Controller Restart
+
+On startup, the storage controller will inspect the split bit for tenant shards that
+it loads from the database. If any splits are in progress:
+
+- Database content will be reverted to the parent shards
+- Child shards will be dropped from memory
+- The parent and child shards will be included in the general startup reconciliation that
+  the storage controller does: any child shards will be detached from pageservers because
+  they don't exist in the storage controller's expected set of shards, and parent shards
+  will be attached if they aren't already.
+
+#### Storage controller API request failures/retries
+
+The split request handler will implement idempotency: if the [`Tenant`] requested to split
+doesn't exist, we will check for the would-be child shards, and if they already exist,
+we consider the request complete.
+
+If a request is retried while the original request is still underway, then the split
+request handler will notice an InProgress marker in TenantManager, and return 503
+to encourage the client to back off and retry. This is the same as the general pageserver
+API handling for calls that try to act on an InProgress shard.
+
+#### Compute start/restart during a split
+
+If a compute starts up during a split, it will be configured with the old sharding
+configuration.
This will work for reads irrespective of the progress of the split
+as long as no child shards have been migrated away from their original location, and
+this is guaranteed in the split procedure (see earlier section).
+
+#### Pageserver fails permanently during a split
+
+If a pageserver permanently fails (i.e. the storage controller availability state for it
+goes to Offline) while a split is in progress, the splitting operation will roll back, and
+during the roll back it will skip any API calls to the offline pageserver. If the offline
+pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
+
+### Handling secondary locations
+
+For correctness, it is not necessary to split secondary locations. We can simply detach
+the secondary locations for parent shards, and then attach new secondary locations
+for child shards.
+
+Clearly this is not optimal, as it will result in re-downloads of layer files that
+were already present on disk. See "Splitting secondary locations" below.
+
+### Conditions to trigger a split
+
+The pageserver will expose a new API for reporting on shards that are candidates
+for split: this will return a top-N report of the largest tenant shards by
+physical size (remote size). This should exclude any tenants that are already
+at the maximum configured shard count.
+
+The API would look something like:
+`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
+
+The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
+
+A split operation will be started when the tenant exceeds some threshold. This threshold
+should be _less than_ how large we actually want shards to be, perhaps much less. That's to
+minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
+wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
+tenant size distribution may be useful here: if we can make a statement like "usually, if
+a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
+make our policy to split a tenant at 20GiB.
+
+The finest split we can do is by factors of two, but we can do higher-cardinality splits
+too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
+as it grows. An example of a very simple heuristic for early deployment of the splitting
+feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
+would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
+split a tenant, it will not need re-splitting soon after.
+
+## Optimizations
+
+### Flush parent shard to remote storage during split
+
+Any data that is in WAL but not remote storage at the time of the split will need
+to be replayed by child shards when they start for the first time. To minimize
+this work, we may flush the parent shard to remote storage before writing the
+remote indices for child shards.
+
+It is important that this flush is subject to some time bounds: we may be splitting
+in response to a surge of write ingest, so it may be time-critical to split. A
+few seconds to flush the latest data should be sufficient to optimize common cases without
+running the risk of holding up a split for a harmful length of time when a parent
+shard is being written heavily. If the flush doesn't complete in time, we may proceed
+to shut down the parent shard and carry on with the split.
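+
+A minimal sketch of the time-bounded flush, assuming a hypothetical
+`flush_to_remote` future on the parent tenant returning `anyhow::Result<()>`;
+the key point is that a timeout is tolerated rather than treated as a fatal error:
+
+```
+const SPLIT_FLUSH_BUDGET: std::time::Duration = std::time::Duration::from_secs(5);
+
+match tokio::time::timeout(SPLIT_FLUSH_BUDGET, parent_tenant.flush_to_remote()).await {
+    Ok(Ok(())) => tracing::info!("parent shard flushed before split"),
+    Ok(Err(e)) => tracing::warn!("flush failed, continuing with split anyway: {e}"),
+    Err(_elapsed) => tracing::warn!("flush timed out, continuing with split anyway"),
+}
+```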
+
+### Hard linking parent layers into child shard directories
+
+Before we start the Tenant objects for child shards, we may pre-populate their
+local storage directories with hard links to the layer files already present
+in the parent shard's local directory. When the child shard starts and downloads
+its remote index, it will find all those layer files already present on local disk.
+
+This avoids wasting download capacity and makes splitting faster, but more importantly
+it avoids taking up a factor of N more disk space when splitting 1 shard into N.
+
+This mechanism will work well in typical flows where shards are migrated away
+promptly after a split, but for the general case including what happens when
+layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
+section below.
+
+### Filtering during compaction
+
+Compaction, especially image layer generation, should skip any keys that are
+present in a shard's layer files, but do not match the shard's ShardIdentity's
+is_key_local() check. This avoids carrying around data for longer than necessary
+in post-split compactions.
+
+This was already implemented in https://github.com/neondatabase/neon/pull/6246.
+
+### Proactive compaction
+
+In remote storage, there is little reason to rewrite any data on a shard split:
+all the children can reference parent layers via the very cheap write of the child
+index_part.json.
+
+In local storage, things are more nuanced. During the initial split there is no
+capacity cost to duplicating parent layers, if we implement the hard linking
+optimization described above. However, as soon as any layers are evicted from
+local disk and re-downloaded, the downloaded layers will not be hard-links any more:
+they'll have a real capacity footprint. That isn't a problem if we migrate child shards
+away from the parent node swiftly, but it risks a significant over-use of local disk
+space if we do not.
+
+For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
+the shards elsewhere, then churned all the layers in all the shards via eviction,
+then we would blow up the storage capacity used on the node by 8x. If we're splitting
+a 100GB shard, that could take the pageserver to the point of exhausting disk space.
+
+To avoid this scenario, we could implement a special compaction mode where we just
+read historic layers, drop unwanted keys, and write back the layer file. This
+is pretty expensive, but useful if we have split a large shard and are not going to
+migrate the child shards away.
+
+The heuristic conditions for triggering such a compaction are:
+
+- A) eviction plus time: if a child shard
+  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
+- B) resident size plus time: we may inspect the resident layers and calculate how
+  many of them include the overhead of storing pre-split keys. If, after some time
+  threshold (different to the one in case A), we still have such layers occupying
+  local disk space, then we should proactively compact them.
+
+### Cleaning up parent-shard layers
+
+It is functionally harmless to leave parent shard layers in remote storage indefinitely.
+They would be cleaned up in the event of the tenant's deletion.
+
+As an optimization to avoid leaking remote storage capacity (which costs money), we may
+lazily clean up parent shard layers once no child shards reference them.
+
+This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
+
+- List all the key prefixes beginning with the tenant ID, and select those shard prefixes
+  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e.
+  `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_).
+- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
+  may drop out now.
+- Find the latest-generation index for each _current shard_, read them all, and accumulate the set of layers belonging to ancestral shards referenced by these indices.
+- For all ancestral shards, list objects in the prefix and delete any layer which was not
+  referenced by a current shard.
+
+If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
+
+The cleanup may be done by the scrubber (external process), or we may choose to have
+the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
+reading the other shards' indices at runtime, and we do not require visibility of the
+latest index writes.
+
+Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
+that we retain the option to roll back a split in case of bugs.
+
+### Splitting secondary locations
+
+We may implement a pageserver API similar to the main splitting API, which does a simpler
+operation for secondary locations: it would not write anything to S3; instead it would simply
+create the child shard directory on local disk, hard link in directories from the parent,
+and set up the in-memory (TenantSlot) state for the children.
+
+Similar to attached locations, a subset of secondary locations will probably need re-locating
+after the split is complete, to avoid leaving multiple child shards on the same pageservers,
+where they may use excessive space for the tenant.
+
+## FAQ/Alternatives
+
+### What should the thresholds be set to?
+
+Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
+
+Max shard count:
+
+- The safekeeper overhead to sharding is currently O(N) network bandwidth because
+  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
+  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
+  on the safekeeper.
+- There is also little benefit to increasing the shard count beyond the number
+  of pageservers in a region.
+
+### Is it worth just rewriting all the data during a split to simplify reasoning about space?
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index c172354e9f..e33bd0f486 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -6,7 +6,10 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{models::ShardParameters, shard::TenantShardId}; +use crate::{ + models::{ShardParameters, TenantConfig}, + shard::{ShardStripeSize, TenantShardId}, +}; #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { @@ -35,7 +38,7 @@ pub struct NodeRegisterRequest { pub struct NodeConfigureRequest { pub node_id: NodeId, - pub availability: Option, + pub availability: Option, pub scheduling: Option, } @@ -57,6 +60,31 @@ pub struct TenantLocateResponse { pub shard_params: ShardParameters, } +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponse { + pub shards: Vec, + pub stripe_size: ShardStripeSize, + pub policy: PlacementPolicy, + pub config: TenantConfig, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponseShard { + pub tenant_shard_id: TenantShardId, + + pub node_attached: Option, + pub node_secondary: Vec, + + pub last_error: String, + + /// A task is currently running to reconcile this tenant's intent state with the state on pageservers + pub is_reconciling: bool, + /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. + pub is_pending_compute_notification: bool, + /// A shard split is currently underway + pub is_splitting: bool, +} + /// Explicitly migrating a particular shard is a low level operation /// TODO: higher level "Reschedule tenant" operation where the request /// specifies some constraints, e.g. asking it to get off particular node(s) @@ -66,22 +94,76 @@ pub struct TenantShardMigrateRequest { pub node_id: NodeId, } -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +/// Utilisation score indicating how good a candidate a pageserver +/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. +/// Lower values are better. +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct UtilizationScore(pub u64); + +impl UtilizationScore { + pub fn worst() -> Self { + UtilizationScore(u64::MAX) + } +} + +#[derive(Serialize, Clone, Copy)] +#[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state - Active, + Active(UtilizationScore), // Offline: Tenants shouldn't try to attach here, but they may assume that their // secondary locations on this node still exist. Newly added nodes are in this // state until we successfully contact them. Offline, } +impl PartialEq for NodeAvailability { + fn eq(&self, other: &Self) -> bool { + use NodeAvailability::*; + matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + } +} + +impl Eq for NodeAvailability {} + +// This wrapper provides serde functionality and it should only be used to +// communicate with external callers which don't know or care about the +// utilisation score of the pageserver it is targeting. +#[derive(Serialize, Deserialize, Clone)] +pub enum NodeAvailabilityWrapper { + Active, + Offline, +} + +impl From for NodeAvailability { + fn from(val: NodeAvailabilityWrapper) -> Self { + match val { + // Assume the worst utilisation score to begin with. It will later be updated by + // the heartbeats. 
+ NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, + } + } +} + +impl From for NodeAvailabilityWrapper { + fn from(val: NodeAvailability) -> Self { + match val { + NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, + } + } +} + impl FromStr for NodeAvailability { type Err = anyhow::Error; fn from_str(s: &str) -> Result { match s { - "active" => Ok(Self::Active), + // This is used when parsing node configuration requests from neon-local. + // Assume the worst possible utilisation score + // and let it get updated via the heartbeats. + "active" => Ok(Self::Active(UtilizationScore::worst())), "offline" => Ok(Self::Offline), _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), } @@ -127,11 +209,8 @@ impl From for String { /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] pub enum PlacementPolicy { - /// Cheapest way to attach a tenant: just one pageserver, no secondary - Single, - /// Production-ready way to attach a tenant: one attached pageserver and - /// some number of secondaries. - Double(usize), + /// Normal live state: one attached pageserver and zero or more secondaries. + Attached(usize), /// Create one secondary mode locations. This is useful when onboarding /// a tenant, or for an idle tenant that we might want to bring online quickly. Secondary, @@ -153,14 +232,14 @@ mod test { /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { - let v = PlacementPolicy::Double(1); + let v = PlacementPolicy::Attached(1); let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "{\"Double\":1}"); + assert_eq!(encoded, "{\"Attached\":1}"); assert_eq!(serde_json::from_str::(&encoded)?, v); - let v = PlacementPolicy::Single; + let v = PlacementPolicy::Detached; let encoded = serde_json::to_string(&v)?; - assert_eq!(encoded, "\"Single\""); + assert_eq!(encoded, "\"Detached\""); assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a96cc09158..aad4cc97fc 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -4,6 +4,7 @@ pub mod utilization; pub use utilization::PageserverUtilization; use std::{ + borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, @@ -426,7 +427,7 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: TenantShardId, + pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -577,7 +578,7 @@ pub struct TimelineInfo { pub walreceiver_status: String, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerMapInfo { pub in_memory_layers: Vec, pub historic_layers: Vec, @@ -595,7 +596,7 @@ pub enum LayerAccessKind { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStatFullDetails { pub when_millis_since_epoch: u64, - pub task_kind: &'static str, + pub task_kind: Cow<'static, str>, pub access_kind: LayerAccessKind, } @@ -654,23 +655,23 @@ impl LayerResidenceEvent { } } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub 
struct LayerAccessStats { pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec<&'static str>, + pub task_kind_access_flag: Vec>, pub first: Option, pub accesses_history: HistoryBufferWithDropCounter, pub residence_events_history: HistoryBufferWithDropCounter, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum InMemoryLayerInfo { Open { lsn_start: Lsn }, Frozen { lsn_start: Lsn, lsn_end: Lsn }, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum HistoricLayerInfo { Delta { @@ -692,6 +693,32 @@ pub enum HistoricLayerInfo { }, } +impl HistoricLayerInfo { + pub fn layer_file_name(&self) -> &str { + match self { + HistoricLayerInfo::Delta { + layer_file_name, .. + } => layer_file_name, + HistoricLayerInfo::Image { + layer_file_name, .. + } => layer_file_name, + } + } + pub fn is_remote(&self) -> bool { + match self { + HistoricLayerInfo::Delta { remote, .. } => *remote, + HistoricLayerInfo::Image { remote, .. } => *remote, + } + } + pub fn set_remote(&mut self, value: bool) { + let field = match self { + HistoricLayerInfo::Delta { remote, .. } => remote, + HistoricLayerInfo::Image { remote, .. } => remote, + }; + *field = value; + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, @@ -724,6 +751,52 @@ pub struct WalRedoManagerStatus { pub pid: Option, } +/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// a download job, timing out while waiting for it to run, and then inspecting this status to understand +/// what's happening. +#[derive(Default, Debug, Serialize, Deserialize, Clone)] +pub struct SecondaryProgress { + /// The remote storage LastModified time of the heatmap object we last downloaded. + #[serde( + serialize_with = "opt_ser_rfc3339_millis", + deserialize_with = "opt_deser_rfc3339_millis" + )] + pub heatmap_mtime: Option, + + /// The number of layers currently on-disk + pub layers_downloaded: usize, + /// The number of layers in the most recently seen heatmap + pub layers_total: usize, + + /// The number of layer bytes currently on-disk + pub bytes_downloaded: u64, + /// The number of layer bytes in the most recently seen heatmap + pub bytes_total: u64, +} + +fn opt_ser_rfc3339_millis( + ts: &Option, + serializer: S, +) -> Result { + match ts { + Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)), + None => serializer.serialize_none(), + } +} + +fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::de::Deserializer<'de>, +{ + let s: Option = serde::de::Deserialize::deserialize(deserializer)?; + match s { + None => Ok(None), + Some(s) => humantime::parse_rfc3339(&s) + .map_err(serde::de::Error::custom) + .map(Some), + } +} + pub mod virtual_file { #[derive( Copy, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 7195a12395..f5984dff5d 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -7,7 +7,7 @@ use std::time::SystemTime; /// /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might /// not handle full u64 values properly. 
-#[derive(serde::Serialize, Debug)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { /// Used disk space #[serde(serialize_with = "ser_saturating_u63")] @@ -21,7 +21,10 @@ pub struct PageserverUtilization { /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - #[serde(serialize_with = "ser_rfc3339_millis")] + #[serde( + serialize_with = "ser_rfc3339_millis", + deserialize_with = "deser_rfc3339_millis" + )] pub captured_at: SystemTime, } @@ -32,6 +35,14 @@ fn ser_rfc3339_millis( serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) } +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// /// Instead of newtype, use this because a newtype would get require handling deserializing values diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 15f3cd3b80..4a53f485ca 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -18,6 +18,7 @@ camino.workspace = true humantime.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true +rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 12ec680cb6..5fff3e25c9 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -157,9 +157,8 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part?; - let etag_str: &str = part.blob.properties.etag.as_ref(); if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + etag = Some(part.blob.properties.etag); } if last_modified.is_none() { last_modified = Some(part.blob.properties.last_modified.into()); @@ -174,6 +173,16 @@ impl AzureBlobStorage { .map_err(|e| DownloadError::Other(e.into()))?; bufs.push(data); } + + if bufs.is_empty() { + return Err(DownloadError::Other(anyhow::anyhow!( + "Azure GET response contained no buffers" + ))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already + let etag = etag.unwrap(); + let last_modified = last_modified.unwrap(); + Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), etag, diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index b0b69f9155..ab2035f19a 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -42,6 +42,9 @@ pub use self::{ }; use s3_bucket::RequestKind; +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. 
+pub use azure_core::Etag; + pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -291,9 +294,9 @@ pub type DownloadStream = pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) - pub last_modified: Option, + pub last_modified: SystemTime, /// A way to identify this specific version of the resource (`etag` HTTP header) - pub etag: Option, + pub etag: Etag, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 478ad81dc1..313d8226b1 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -10,7 +10,7 @@ use std::{ io::ErrorKind, num::NonZeroU32, pin::Pin, - time::{Duration, SystemTime}, + time::{Duration, SystemTime, UNIX_EPOCH}, }; use anyhow::{bail, ensure, Context}; @@ -30,6 +30,7 @@ use crate::{ }; use super::{RemoteStorage, StorageMetadata}; +use crate::Etag; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -406,35 +407,37 @@ impl RemoteStorage for LocalFs { cancel: &CancellationToken, ) -> Result { let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let source = ReaderStream::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); - let metadata = self - .read_storage_metadata(&target_path) + let file_metadata = file_metadata(&target_path).await?; + + let source = ReaderStream::new( + fs::OpenOptions::new() + .read(true) + .open(&target_path) .await - .map_err(DownloadError::Other)?; + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") + }) + .map_err(DownloadError::Other)?, + ); - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), - }) - } else { - Err(DownloadError::NotFound) - } + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } async fn download_byte_range( @@ -452,50 +455,51 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } + let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ - let mut source = tokio::fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?; - - let len = source - .metadata() - .await - .context("query file length") - .map_err(DownloadError::Other)? - .len(); - - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file") - .map_err(DownloadError::Other)?; - - let metadata = self - .read_storage_metadata(&target_path) - .await - .map_err(DownloadError::Other)?; - - let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); - let source = ReaderStream::new(source); - - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); - - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), + let file_metadata = file_metadata(&target_path).await?; + let mut source = tokio::fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") }) - } else { - Err(DownloadError::NotFound) - } + .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? + .len(); + + source + .seek(io::SeekFrom::Start(start_inclusive)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { @@ -610,13 +614,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result< Ok(()) } -fn file_exists(file_path: &Utf8Path) -> anyhow::Result { - if file_path.exists() { - ensure!(file_path.is_file(), "file path '{file_path}' is not a file"); - Ok(true) - } else { - Ok(false) - } +async fn file_metadata(file_path: &Utf8Path) -> Result { + tokio::fs::metadata(&file_path).await.map_err(|e| { + if e.kind() == ErrorKind::NotFound { + DownloadError::NotFound + } else { + DownloadError::BadInput(e.into()) + } + }) +} + +// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we +// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests +// quickly, with less overhead than using a mock S3 server. 
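To make the tradeoff concrete before the `mock_etag` helper below: an mtime-derived ETag is stable while the file is untouched and changes on rewrite, even if the bytes are identical, which is acceptable for the tests this backend serves. A standalone sketch (the file name is a placeholder):

```rust
use std::time::UNIX_EPOCH;

// Same derivation as the helper below: milliseconds since the epoch of mtime.
fn mtime_millis(meta: &std::fs::Metadata) -> u128 {
    meta.modified()
        .expect("filesystem does not report mtime")
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis()
}

fn main() -> std::io::Result<()> {
    std::fs::write("demo.txt", b"hello")?;
    let a = mtime_millis(&std::fs::metadata("demo.txt")?);
    let b = mtime_millis(&std::fs::metadata("demo.txt")?);
    assert_eq!(a, b); // unchanged file => stable mock ETag
    std::fs::write("demo.txt", b"hello")?; // same bytes, but a fresh mtime
    // The mock ETag may now differ even though the content did not.
    Ok(())
}
```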
+fn mock_etag(meta: &std::fs::Metadata) -> Etag { + let mtime = meta.modified().expect("Filesystem mtime missing"); + format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into() } #[cfg(test)] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 438f45fbde..1cb85cfb1b 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -35,8 +35,8 @@ use aws_sdk_s3::{ }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_types::byte_stream::ByteStream; use aws_smithy_types::{body::SdkBody, DateTime}; +use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; use hyper::Body; @@ -287,8 +287,17 @@ impl S3Bucket { let remaining = self.timeout.saturating_sub(started_at.elapsed()); let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag; - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); + let etag = object_output + .e_tag + .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? + .into(); + let last_modified = object_output + .last_modified + .ok_or(DownloadError::Other(anyhow::anyhow!( + "Missing LastModified header" + )))? + .try_into() + .map_err(|e: ConversionError| DownloadError::Other(e.into()))?; let body = object_output.body; let body = ByteStreamAsStream::from(body); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index d8b9824d99..bc5e40e70f 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // A little check to ensure that our clock is not too far off from the S3 clock { let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; - let last_modified = dl.last_modified.unwrap(); + let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index de27ae4e28..b3b50461da 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,6 +13,7 @@ testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true +async-compression.workspace = true async-trait.workspace = true anyhow.workspace = true bincode.workspace = true @@ -36,6 +37,7 @@ serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-tar.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-error.workspace = true @@ -46,6 +48,7 @@ strum.workspace = true strum_macros.workspace = true url.workspace = true uuid.workspace = true +walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs index 1f07f5560f..bd35e2bad6 100644 --- a/libs/utils/src/history_buffer.rs +++ b/libs/utils/src/history_buffer.rs @@ -47,9 +47,10 @@ impl ops::Deref for HistoryBufferWithDropCounter { } } -#[derive(serde::Serialize)] +#[derive(serde::Serialize, serde::Deserialize)] struct SerdeRepr { buffer: Vec, + buffer_size: usize, drop_count: u64, } @@ -61,6 +62,7 @@ where let HistoryBufferWithDropCounter { buffer, drop_count } = value; SerdeRepr { buffer: buffer.iter().cloned().collect(), + buffer_size: L, drop_count: *drop_count, } } @@ -78,19 +80,52 @@ 
where } } +impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter +where + T: Clone + serde::Deserialize<'de>, +{ + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let SerdeRepr { + buffer: des_buffer, + drop_count, + buffer_size, + } = SerdeRepr::::deserialize(deserializer)?; + if buffer_size != L { + use serde::de::Error; + return Err(D::Error::custom(format!( + "invalid buffer_size, expecting {L} got {buffer_size}" + ))); + } + let mut buffer = HistoryBuffer::new(); + buffer.extend(des_buffer); + Ok(HistoryBufferWithDropCounter { buffer, drop_count }) + } +} + #[cfg(test)] mod test { use super::HistoryBufferWithDropCounter; #[test] fn test_basics() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); + let mut b = HistoryBufferWithDropCounter::::default(); b.write(1); b.write(2); b.write(3); assert!(b.iter().any(|e| *e == 2)); assert!(b.iter().any(|e| *e == 3)); assert!(!b.iter().any(|e| *e == 1)); + + // round-trip serde + let round_tripped: HistoryBufferWithDropCounter = + serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); + assert_eq!( + round_tripped.iter().cloned().collect::>(), + b.iter().cloned().collect::>() + ); } #[test] diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index fd4e068b39..336ee87570 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -87,6 +87,8 @@ pub mod failpoint_support; pub mod yielding_loop; +pub mod zstd; + pub mod poison; /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index bc8fa7362e..3ddfa44f41 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -29,12 +29,10 @@ pub struct PageserverFeedback { // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, + /// Used to track feedbacks from different shards. Always zero for unsharded tenants. + pub shard_number: u32, } -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. -pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { @@ -43,6 +41,7 @@ impl PageserverFeedback { remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, + shard_number: 0, } } @@ -59,17 +58,26 @@ impl PageserverFeedback { // // TODO: change serialized fields names once all computes migrate to rename. 
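What the new `buffer_size` field buys: a payload recorded from a buffer of one size cannot be silently loaded into a buffer of another size. A hedged sketch of the rejection path, assuming the crate-internal types above are in scope; the `PageserverFeedback::serialize` body that the TODO above annotates follows.

```rust
#[test]
fn buffer_size_mismatch_is_rejected() {
    // Serialize from a 2-slot buffer: the payload carries "buffer_size": 2.
    let small = HistoryBufferWithDropCounter::<u32, 2>::default();
    let json = serde_json::to_string(&small).unwrap();
    // Deserializing into a 4-slot buffer trips the guard added above.
    let err = serde_json::from_str::<HistoryBufferWithDropCounter<u32, 4>>(&json).unwrap_err();
    assert!(err.to_string().contains("invalid buffer_size"));
}
```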
pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + let buf_ptr = buf.len(); + buf.put_u8(0); // # of keys, will be filled later + let mut nkeys = 0; + + nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); + nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); @@ -80,9 +88,19 @@ impl PageserverFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; + nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); + + if self.shard_number > 0 { + nkeys += 1; + buf.put_slice(b"shard_number\0"); + buf.put_i32(4); + buf.put_u32(self.shard_number); + } + + buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message @@ -125,9 +143,8 @@ impl PageserverFeedback { } b"shard_number" => { let len = buf.get_i32(); - // TODO: this will be implemented in the next update, - // for now, we just skip the value. - buf.advance(len as usize); + assert_eq!(len, 4); + rf.shard_number = buf.get_u32(); } _ => { let len = buf.get_i32(); @@ -200,10 +217,7 @@ mod tests { rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - + data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 703a6dfd52..a3aee45b58 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -110,6 +110,49 @@ impl OnceCell { } } + /// Returns a guard to an existing initialized value, or returns an unique initialization + /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. + pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + // It looks like OnceCell::get_or_init could be implemented using this method instead of + // duplication. However, that makes the future be !Send due to possibly holding on to the + // MutexGuard over an await point. + loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + return Err(permit); + } + } + /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used /// to complete initializing the inner value. 
///
@@ -481,4 +524,39 @@ mod tests {
         assert_eq!("t1", *cell.get().unwrap());
     }
+
+    #[tokio::test(start_paused = true)]
+    async fn detached_init_smoke() {
+        let target = OnceCell::default();
+
+        let Err(permit) = target.get_or_init_detached().await else {
+            unreachable!("it is not initialized")
+        };
+
+        tokio::time::timeout(
+            std::time::Duration::from_secs(3600 * 24 * 7 * 365),
+            target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
+        )
+        .await
+        .expect_err("should timeout since we are already holding the permit");
+
+        target.set(42, permit);
+
+        let (_answer, permit) = {
+            let mut guard = target
+                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
+                .await
+                .unwrap();
+
+            assert_eq!(*guard, 42);
+
+            guard.take_and_deinit()
+        };
+
+        assert!(target.get().is_none());
+
+        target.set(11, permit);
+
+        assert_eq!(*target.get().unwrap(), 11);
+    }
 }
diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs
new file mode 100644
index 0000000000..be2dcc00f5
--- /dev/null
+++ b/libs/utils/src/zstd.rs
@@ -0,0 +1,78 @@
+use std::io::SeekFrom;
+
+use anyhow::{Context, Result};
+use async_compression::{
+    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
+    zstd::CParameter,
+    Level,
+};
+use camino::Utf8Path;
+use nix::NixPath;
+use tokio::{
+    fs::{File, OpenOptions},
+    io::AsyncBufRead,
+    io::AsyncSeekExt,
+    io::AsyncWriteExt,
+};
+use tokio_tar::{Archive, Builder, HeaderMode};
+use walkdir::WalkDir;
+
+/// Creates a Zstandard tarball.
+pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tarball)
+        .await
+        .with_context(|| format!("tempfile creation {tarball}"))?;
+
+    let mut paths = Vec::new();
+    for entry in WalkDir::new(path) {
+        let entry = entry?;
+        let metadata = entry.metadata().expect("error getting dir entry metadata");
+        // Also allow directories so that we also get empty directories
+        if !(metadata.is_file() || metadata.is_dir()) {
+            continue;
+        }
+        let path = entry.into_path();
+        paths.push(path);
+    }
+    // Do a sort to get a more consistent listing
+    paths.sort_unstable();
+    let zstd = ZstdEncoder::with_quality_and_params(
+        file,
+        Level::Default,
+        &[CParameter::enable_long_distance_matching(true)],
+    );
+    let mut builder = Builder::new(zstd);
+    // Use reproducible header mode
+    builder.mode(HeaderMode::Deterministic);
+    for p in paths {
+        let rel_path = p.strip_prefix(path)?;
+        if rel_path.is_empty() {
+            // The top directory should not be compressed,
+            // the tar crate doesn't like that
+            continue;
+        }
+        builder.append_path_with_name(&p, rel_path).await?;
+    }
+    let mut zstd = builder.into_inner().await?;
+    zstd.shutdown().await?;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
+}
+
+/// Extracts a Zstandard tarball.
+pub async fn extract_zst_tarball( + path: &Utf8Path, + tarball: impl AsyncBufRead + Unpin, +) -> Result<()> { + let decoder = Box::pin(ZstdDecoder::new(tarball)); + let mut archive = Archive::new(decoder); + archive.unpack(path).await?; + Ok(()) +} diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index f5ed6ebb97..906302e46e 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp)) + (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 734967da3f..14cc3e05a2 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -142,7 +142,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) { todo!() } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 5adeaffe1a..f304294591 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -89,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] criterion.workspace = true hex-literal.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 732eb951c9..ab55d2b0a3 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -169,7 +169,7 @@ impl Client { self.request(Method::GET, uri, ()).await } - async fn request( + async fn request_noerror( &self, method: Method, uri: U, @@ -181,7 +181,16 @@ impl Client { } else { req }; - let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } @@ -240,13 +249,26 @@ impl Client { Ok(()) } - pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { - let uri = format!( + pub async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/download", self.mgmt_api_endpoint, tenant_id - ); - self.request(Method::POST, &uri, ()).await?; - Ok(()) + )) + .expect("Cannot build URL"); + + if let Some(wait) = wait { + path.query_pairs_mut() + .append_pair("wait_ms", &format!("{}", wait.as_millis())); + } + + let response = self.request(Method::POST, path, ()).await?; + let status = response.status(); + let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?; + Ok((status, progress)) } pub async fn location_config( @@ -257,7 +279,7 @@ impl Client { lazy: bool, ) -> Result<()> { let req_body = TenantLocationConfigRequest { - tenant_id: tenant_shard_id, + 
tenant_id: Some(tenant_shard_id), config, }; @@ -416,4 +438,77 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn get_utilization(&self) -> Result { + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_map_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_evict( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::DELETE, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn layer_ondemand_download( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::GET, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs new file mode 100644 index 0000000000..197e782dca --- /dev/null +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -0,0 +1,272 @@ +use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; + +use pageserver_client::mgmt_api; +use rand::seq::SliceRandom; +use tracing::{debug, info}; +use utils::id::{TenantTimelineId, TimelineId}; + +use tokio::{ + sync::{mpsc, OwnedSemaphorePermit}, + task::JoinSet, +}; + +use std::{ + num::NonZeroUsize, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; + +/// Evict & on-demand download random layers. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long)] + runtime: Option, + #[clap(long, default_value = "1")] + tasks_per_target: NonZeroUsize, + #[clap(long, default_value = "1")] + concurrency_per_target: NonZeroUsize, + /// Probability for sending `latest=true` in the request (uniform distribution). + #[clap(long)] + limit_to_first_n_targets: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. 
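A hypothetical caller of the reworked `tenant_secondary_download` (the helper name and shard id here are assumptions, not part of this diff): poll with a bounded `wait_ms`, treating 202 as "job started, keep waiting" and anything else as terminal. The benchmark's `Args` struct continues below.

```rust
use std::time::Duration;

use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use reqwest::StatusCode;

async fn wait_for_secondary_download(
    client: &mgmt_api::Client,
    shard: TenantShardId,
) -> Result<(), mgmt_api::Error> {
    loop {
        // Ask the pageserver to wait up to 5s before answering.
        let (status, progress) = client
            .tenant_secondary_download(shard, Some(Duration::from_millis(5_000)))
            .await?;
        println!(
            "downloaded {}/{} layers, {}/{} bytes",
            progress.layers_downloaded,
            progress.layers_total,
            progress.bytes_downloaded,
            progress.bytes_total,
        );
        if status != StatusCode::ACCEPTED {
            // 200: the download job ran to completion.
            return Ok(());
        }
    }
}
```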
+ #[clap(long)] + set_io_engine: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let task = rt.spawn(main_impl(args)); + rt.block_on(task).unwrap().unwrap(); + Ok(()) +} + +#[derive(Debug, Default)] +struct LiveStats { + evictions: AtomicU64, + downloads: AtomicU64, + timeline_restarts: AtomicU64, +} + +impl LiveStats { + fn eviction_done(&self) { + self.evictions.fetch_add(1, Ordering::Relaxed); + } + fn download_done(&self) { + self.downloads.fetch_add(1, Ordering::Relaxed); + } + fn timeline_restart_done(&self) { + self.timeline_restarts.fetch_add(1, Ordering::Relaxed); + } +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut tasks = JoinSet::new(); + + let live_stats = Arc::new(LiveStats::default()); + tasks.spawn({ + let live_stats = Arc::clone(&live_stats); + async move { + let mut last_at = Instant::now(); + loop { + tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; + let now = Instant::now(); + let delta: Duration = now - last_at; + last_at = now; + + let LiveStats { + evictions, + downloads, + timeline_restarts, + } = &*live_stats; + let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64(); + let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); + info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}"); + } + } + }); + + for tl in timelines { + for _ in 0..args.tasks_per_target.get() { + tasks.spawn(timeline_actor( + args, + Arc::clone(&mgmt_api_client), + tl, + Arc::clone(&live_stats), + )); + } + } + + while let Some(res) = tasks.join_next().await { + res.unwrap(); + } + Ok(()) +} + +async fn timeline_actor( + args: &'static Args, + mgmt_api_client: Arc, + timeline: TenantTimelineId, + live_stats: Arc, +) { + // TODO: support sharding + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + + struct Timeline { + joinset: JoinSet<()>, + layers: Vec>, + concurrency: Arc, + } + loop { + debug!("restarting timeline"); + let layer_map_info = mgmt_api_client + .layer_map_info(tenant_shard_id, timeline.timeline_id) + .await + .unwrap(); + let concurrency = Arc::new(tokio::sync::Semaphore::new( + args.concurrency_per_target.get(), + )); + + let mut joinset = JoinSet::new(); + let layers = layer_map_info + .historic_layers + .into_iter() + .map(|historic_layer| { + let (tx, rx) = mpsc::channel(1); + joinset.spawn(layer_actor( + tenant_shard_id, + timeline.timeline_id, + historic_layer, + rx, + Arc::clone(&mgmt_api_client), + Arc::clone(&live_stats), + )); + tx + }) + .collect::>(); + + let mut timeline = Timeline { + joinset, + layers, + concurrency, + }; + + live_stats.timeline_restart_done(); + + loop { + assert!(!timeline.joinset.is_empty()); + if let Some(res) = 
timeline.joinset.try_join_next() { + debug!(?res, "a layer actor exited, should not happen"); + timeline.joinset.shutdown().await; + break; + } + + let mut permit = Some( + Arc::clone(&timeline.concurrency) + .acquire_owned() + .await + .unwrap(), + ); + + loop { + let layer_tx = { + let mut rng = rand::thread_rng(); + timeline.layers.choose_mut(&mut rng).expect("no layers") + }; + match layer_tx.try_send(permit.take().unwrap()) { + Ok(_) => break, + Err(e) => match e { + mpsc::error::TrySendError::Full(back) => { + // TODO: retrying introduces bias away from slow downloaders + permit.replace(back); + } + mpsc::error::TrySendError::Closed(_) => panic!(), + }, + } + } + } + } +} + +async fn layer_actor( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + mut layer: HistoricLayerInfo, + mut rx: mpsc::Receiver, + mgmt_api_client: Arc, + live_stats: Arc, +) { + #[derive(Clone, Copy)] + enum Action { + Evict, + OnDemandDownload, + } + + while let Some(_permit) = rx.recv().await { + let action = if layer.is_remote() { + Action::OnDemandDownload + } else { + Action::Evict + }; + + let did_it = match action { + Action::Evict => { + let did_it = mgmt_api_client + .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.eviction_done(); + did_it + } + Action::OnDemandDownload => { + let did_it = mgmt_api_client + .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.download_done(); + did_it + } + }; + if !did_it { + debug!("local copy of layer map appears out of sync, re-downloading"); + return; + } + debug!("did it"); + layer.set_remote(match action { + Action::Evict => true, + Action::OnDemandDownload => false, + }); + } +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 5d688ed2d1..743102d853 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -16,6 +16,7 @@ mod util { mod cmd { pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; + pub(super) mod ondemand_download_churn; pub(super) mod trigger_initial_size_calculation; } @@ -25,6 +26,7 @@ enum Args { Basebackup(cmd::basebackup::Args), GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), + OndemandDownloadChurn(cmd::ondemand_download_churn::Args), } fn main() { @@ -43,6 +45,7 @@ fn main() { Args::TriggerInitialSizeCalculation(args) => { cmd::trigger_initial_size_calculation::main(args) } + Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), } .unwrap() } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2f172bd384..1fd7c775d5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,3 +1,5 @@ +#![recursion_limit = "300"] + //! Main entry point for the Page Server executable. 
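One detail of the churn loop above worth calling out before the `pageserver.rs` hunk continues: the permit hand-off relies on `try_send` over a bounded channel of size 1, so a busy layer actor hands the permit back to the caller instead of queueing it (the bias the TODO above mentions). A self-contained sketch of that pattern:

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(1);
    tx.try_send(1).unwrap(); // the single slot is now taken
    match tx.try_send(2) {
        // Full returns the value, so the caller can pick another worker.
        Err(mpsc::error::TrySendError::Full(returned)) => assert_eq!(returned, 2),
        other => panic!("unexpected: {other:?}"),
    }
    assert_eq!(rx.recv().await, Some(1));
}
```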
use std::env::{var, VarError}; @@ -118,6 +120,9 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); + // after setting up logging, log the effective IO engine choice + info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + let tenants_path = conf.tenants_path(); if !tenants_path.exists() { utils::crashsafe::create_dir_all(conf.tenants_path()) @@ -312,6 +317,7 @@ fn start_pageserver( let http_listener = tcp_listener::bind(http_addr)?; let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; @@ -544,7 +550,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, - tenant_manager, + tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), @@ -688,6 +694,7 @@ fn start_pageserver( let bg_remote_storage = remote_storage.clone(); let bg_deletion_queue = deletion_queue.clone(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + &tenant_manager, bg_remote_storage.map(|_| bg_deletion_queue), 0, )); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 70aa30d24e..89fda958ff 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -30,18 +30,17 @@ use utils::{ logging::LogFormat, }; -use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; -use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; -use crate::virtual_file; +use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{tenant::config::TenantConf, virtual_file}; use crate::{ IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -287,16 +286,23 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); // use dedicated enum for builder to better indicate the intention // and avoid possible confusion with nested options +#[derive(Clone, Default)] pub enum BuilderValue { Set(T), + #[default] NotSet, } -impl BuilderValue { - pub fn ok_or(self, err: E) -> Result { +impl BuilderValue { + pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { match self { - Self::Set(v) => Ok(v), - Self::NotSet => Err(err), + Self::Set(v) => Ok(v.clone()), + Self::NotSet => match default { + BuilderValue::Set(v) => Ok(v.clone()), + BuilderValue::NotSet => { + anyhow::bail!("missing config value {field_name:?}") + } + }, } } } @@ -322,6 +328,7 @@ pub(crate) struct NodeMetadata { } // needed to simplify config construction +#[derive(Default)] struct PageServerConfigBuilder { listen_pg_addr: BuilderValue, @@ -388,8 +395,9 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, } -impl Default for PageServerConfigBuilder { - fn default() -> Self { +impl PageServerConfigBuilder { + #[inline(always)] + fn default_values() -> Self { use self::BuilderValue::*; use defaults::*; Self { @@ -636,122 +644,95 @@ impl PageServerConfigBuilder { } pub fn build(self) -> anyhow::Result { - let concurrent_tenant_warmup = self - .concurrent_tenant_warmup - 
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?; - let concurrent_tenant_size_logical_size_queries = self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?; - Ok(PageServerConf { - listen_pg_addr: self - .listen_pg_addr - .ok_or(anyhow!("missing listen_pg_addr"))?, - listen_http_addr: self - .listen_http_addr - .ok_or(anyhow!("missing listen_http_addr"))?, - availability_zone: self - .availability_zone - .ok_or(anyhow!("missing availability_zone"))?, - wait_lsn_timeout: self - .wait_lsn_timeout - .ok_or(anyhow!("missing wait_lsn_timeout"))?, - superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, - page_cache_size: self - .page_cache_size - .ok_or(anyhow!("missing page_cache_size"))?, - max_file_descriptors: self - .max_file_descriptors - .ok_or(anyhow!("missing max_file_descriptors"))?, - workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, - pg_distrib_dir: self - .pg_distrib_dir - .ok_or(anyhow!("missing pg_distrib_dir"))?, - http_auth_type: self - .http_auth_type - .ok_or(anyhow!("missing http_auth_type"))?, - pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, - auth_validation_public_key_path: self - .auth_validation_public_key_path - .ok_or(anyhow!("missing auth_validation_public_key_path"))?, - remote_storage_config: self - .remote_storage_config - .ok_or(anyhow!("missing remote_storage_config"))?, - id: self.id.ok_or(anyhow!("missing id"))?, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - broker_endpoint: self - .broker_endpoint - .ok_or(anyhow!("No broker endpoints provided"))?, - broker_keepalive_interval: self - .broker_keepalive_interval - .ok_or(anyhow!("No broker keepalive interval provided"))?, - log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - metric_collection_interval: self - .metric_collection_interval - .ok_or(anyhow!("missing metric_collection_interval"))?, - cached_metric_collection_interval: self - .cached_metric_collection_interval - .ok_or(anyhow!("missing cached_metric_collection_interval"))?, - metric_collection_endpoint: self - .metric_collection_endpoint - .ok_or(anyhow!("missing metric_collection_endpoint"))?, - synthetic_size_calculation_interval: self - .synthetic_size_calculation_interval - .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - disk_usage_based_eviction: self - .disk_usage_based_eviction - .ok_or(anyhow!("missing disk_usage_based_eviction"))?, - test_remote_failures: self - .test_remote_failures - .ok_or(anyhow!("missing test_remote_failuers"))?, - ondemand_download_behavior_treat_error_as_warn: self - .ondemand_download_behavior_treat_error_as_warn - .ok_or(anyhow!( - "missing ondemand_download_behavior_treat_error_as_warn" - ))?, - background_task_maximum_delay: self - .background_task_maximum_delay - .ok_or(anyhow!("missing background_task_maximum_delay"))?, - control_plane_api: self - .control_plane_api - .ok_or(anyhow!("missing control_plane_api"))?, - control_plane_api_token: self - .control_plane_api_token - .ok_or(anyhow!("missing control_plane_api_token"))?, - control_plane_emergency_mode: self - 
.control_plane_emergency_mode - .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self - .heatmap_upload_concurrency - .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, - secondary_download_concurrency: self - .secondary_download_concurrency - .ok_or(anyhow!("missing secondary_download_concurrency"))?, - ingest_batch_size: self - .ingest_batch_size - .ok_or(anyhow!("missing ingest_batch_size"))?, - virtual_file_io_engine: self - .virtual_file_io_engine - .ok_or(anyhow!("missing virtual_file_io_engine"))?, - get_vectored_impl: self - .get_vectored_impl - .ok_or(anyhow!("missing get_vectored_impl"))?, - max_vectored_read_bytes: self - .max_vectored_read_bytes - .ok_or(anyhow!("missing max_vectored_read_bytes"))?, - validate_vectored_get: self - .validate_vectored_get - .ok_or(anyhow!("missing validate_vectored_get"))?, - }) + let default = Self::default_values(); + + macro_rules! conf { + (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { + PageServerConf { + $( + $field: self.$field.ok_or(stringify!($field), default.$field)?, + )* + $( + $custom_field: $custom_value, + )* + } + }; + } + + Ok(conf!( + USING DEFAULT + { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + superuser, + page_cache_size, + max_file_descriptors, + workdir, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config, + id, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + cached_metric_collection_interval, + metric_collection_endpoint, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + get_vectored_impl, + max_vectored_read_bytes, + validate_vectored_get, + } + CUSTOM LOGIC + { + // TenantConf is handled separately + default_tenant_conf: TenantConf::default(), + concurrent_tenant_warmup: ConfigurableSemaphore::new({ + self + .concurrent_tenant_warmup + .ok_or("concurrent_tenant_warmpup", + default.concurrent_tenant_warmup)? + }), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + self + .concurrent_tenant_size_logical_size_queries + .ok_or("concurrent_tenant_size_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())? + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + self + .concurrent_tenant_size_logical_size_queries + .ok_or("eviction_task_immitated_concurrent_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())?, + ), + virtual_file_io_engine: match self.virtual_file_io_engine { + BuilderValue::Set(v) => v, + BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. 
+ eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine + } + }, + }, + } + )) } } @@ -831,18 +812,7 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub fn timeline_uninit_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_UNINIT_MARK_SUFFIX, - ) - } - - pub fn timeline_delete_mark_file_path( + pub(crate) fn timeline_delete_mark_file_path( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -853,7 +823,10 @@ impl PageServerConf { ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + pub(crate) fn tenant_deleted_mark_file_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_DELETED_MARKER_FILE_NAME) } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 6a070e2135..0771229845 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -567,9 +567,9 @@ paths: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/location_config: + /v1/tenant/{tenant_shard_id}/location_config: parameters: - - name: tenant_id + - name: tenant_shard_id in: path required: true schema: @@ -965,12 +965,28 @@ paths: required: true schema: type: string + - name: wait_ms + description: If set, we will wait this long for download to complete, and if it isn't complete then return 202 + in: query + required: false + schema: + type: integer post: description: | If the location is in secondary mode, download latest heatmap and layers responses: "200": description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" + "202": + description: Download has started but not yet finished + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" "500": description: Generic operation error content: @@ -1367,10 +1383,11 @@ components: TenantLocationConfigRequest: type: object required: - - tenant_id + - mode properties: tenant_id: type: string + description: Not used, scheduled for removal. mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1622,6 +1639,37 @@ components: Lower is better score for how good this pageserver would be for the next tenant. The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. 
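Summing up the `config.rs` rewrite a few hunks above (the new `SecondaryProgress` schema follows): the `conf!` macro collapses a long run of hand-written `ok_or(anyhow!(...))` chains into a field list, with the new two-argument `ok_or` falling back to the compiled-in default. A standalone model of that fallback, mirroring the logic in this diff:

```rust
use anyhow::bail;

#[derive(Clone)]
enum BuilderValue<T> {
    Set(T),
    NotSet,
}

impl<T: Clone> BuilderValue<T> {
    // Prefer the explicitly set value, fall back to the default,
    // and error out only when both are unset.
    fn ok_or(&self, field: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
        match self {
            Self::Set(v) => Ok(v.clone()),
            Self::NotSet => match default {
                BuilderValue::Set(v) => Ok(v),
                BuilderValue::NotSet => bail!("missing config value {field:?}"),
            },
        }
    }
}

fn main() -> anyhow::Result<()> {
    let unset: BuilderValue<u16> = BuilderValue::NotSet;
    assert_eq!(unset.ok_or("listen_port", BuilderValue::Set(6400))?, 6400);
    assert!(unset.ok_or("listen_port", BuilderValue::NotSet).is_err());
    Ok(())
}
```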
+ SecondaryProgress: + type: object + required: + - heatmap_mtime + - layers_downloaded + - layers_total + - bytes_downloaded + - bytes_total + properties: + heatmap_mtime: + type: string + format: date-time + description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format) + layers_downloaded: + type: integer + format: int64 + description: How many layers from the latest layer heatmap are present on disk + bytes_downloaded: + type: integer + format: int64 + description: How many bytes of layer content from the latest layer heatmap are present on disk + layers_total: + type: integer + format: int64 + description: How many layers were in the latest layer heatmap + bytes_total: + type: integer + format: int64 + description: How many bytes of layer content were in the latest layer heatmap + + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb8b1bb7e5..229f3ae98f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,9 +535,9 @@ async fn timeline_create_handler( ) } Err( - tenant::CreateTimelineError::Conflict - | tenant::CreateTimelineError::AlreadyCreating, - ) => json_response(StatusCode::CONFLICT, ()), + e @ tenant::CreateTimelineError::Conflict + | e @ tenant::CreateTimelineError::AlreadyCreating, + ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), @@ -885,14 +885,16 @@ async fn tenant_detach_handler( let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; + state + .tenant_manager + .detach_tenant( + conf, + tenant_shard_id, + detach_ignored.unwrap_or(false), + &state.deletion_queue_client, + ) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) + .await?; json_response(StatusCode::OK, ()) } @@ -1403,7 +1405,9 @@ async fn update_tenant_config_handler( TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) + state + .tenant_manager + .set_new_tenant_config(tenant_conf, tenant_id) .instrument(info_span!("tenant_config", %tenant_id)) .await?; @@ -1428,13 +1432,14 @@ async fn put_tenant_location_config_handler( // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. 
if let LocationConfigMode::Detached = request_data.config.mode { - if let Err(e) = - mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - )) - .await + if let Err(e) = state + .tenant_manager + .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .instrument(info_span!("tenant_detach", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + )) + .await { match e { TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { @@ -1648,8 +1653,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?; let gc_result = wait_task_done .await .context("wait for gc task") @@ -1987,13 +1991,42 @@ async fn secondary_download_handler( ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; - state - .secondary_controller - .download_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis); - json_response(StatusCode::OK, ()) + // We don't need this to issue the download request, but: + // - it enables us to cleanly return 404 if we get a request for an absent shard + // - we will use this to provide status feedback in the response + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let timeout = wait.unwrap_or(Duration::MAX); + + let status = match tokio::time::timeout( + timeout, + state.secondary_controller.download_tenant(tenant_shard_id), + ) + .await + { + // Download job ran to completion. + Ok(Ok(())) => StatusCode::OK, + // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered + // okay. We could get an error here in the unlikely edge case that the tenant + // was detached between our check above and executing the download job. + Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + // A timeout is not an error: we have started the download, we're just not done + // yet. The caller will get a response body indicating status. + Err(_) => StatusCode::ACCEPTED, + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(status, progress) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -2053,6 +2086,10 @@ async fn get_utilization( r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { + fail::fail_point!("get-utilization-http-handler", |_| { + Err(ApiError::ResourceUnavailable("failpoint".into())) + }); + // this probably could be completely public, but lets make that change later. 
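For the failpoint just added (and the `api-503`/`api-500` ones below), a test can trip it through the standard `fail` crate API, or remotely through the `/v1/failpoints` endpoint that the in-handler check below special-cases. A sketch of the in-process variant, assuming the failpoints feature is enabled; `check_permission` continues below.

```rust
fn exercise_utilization_failpoint() {
    // While active, GET /v1/utilization answers 503 (ResourceUnavailable).
    fail::cfg("get-utilization-http-handler", "return").unwrap();

    // ... issue the request and assert on the 503 here ...

    // Disable the failpoint; the handler behaves normally again.
    fail::cfg("get-utilization-http-handler", "off").unwrap();
}
```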
check_permission(&r, None)?; @@ -2108,6 +2145,16 @@ where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, { + if request.uri() != &"/v1/failpoints".parse::().unwrap() { + fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable( + "failpoint".into() + ))); + + fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError( + anyhow::anyhow!("failpoint") + ))); + } + // Spawn a new task to handle the request, to protect the handler from unexpected // async cancellations. Most pageserver functions are not async cancellation safe. // We arm a drop-guard, so that if Hyper drops the Future, we signal the task diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d66df36b3a..343dec2ca1 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,28 +2,20 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! -use std::io::SeekFrom; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; -use async_compression::tokio::bufread::ZstdDecoder; -use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; -use nix::NixPath; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; -use tokio_tar::Builder; -use tokio_tar::HeaderMode; use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; -use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; @@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } - -pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { - let file = OpenOptions::new() - .create(true) - .truncate(true) - .read(true) - .write(true) - .open(&tmp_path) - .await - .with_context(|| format!("tempfile creation {tmp_path}"))?; - - let mut paths = Vec::new(); - for entry in WalkDir::new(pgdata_path) { - let entry = entry?; - let metadata = entry.metadata().expect("error getting dir entry metadata"); - // Also allow directories so that we also get empty directories - if !(metadata.is_file() || metadata.is_dir()) { - continue; - } - let path = entry.into_path(); - paths.push(path); - } - // Do a sort to get a more consistent listing - paths.sort_unstable(); - let zstd = ZstdEncoder::with_quality_and_params( - file, - Level::Default, - &[CParameter::enable_long_distance_matching(true)], - ); - let mut builder = Builder::new(zstd); - // Use reproducible header mode - builder.mode(HeaderMode::Deterministic); - for path in paths { - let rel_path = path.strip_prefix(pgdata_path)?; - if rel_path.is_empty() { - // The top directory should not be compressed, - // the tar crate doesn't like that - continue; - } - builder.append_path_with_name(&path, rel_path).await?; - } - let mut zstd = builder.into_inner().await?; - zstd.shutdown().await?; - let mut compressed = zstd.into_inner(); - let compressed_len = compressed.metadata().await?.len(); - const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; - if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { - 
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); - } - compressed.seek(SeekFrom::Start(0)).await?; - Ok((compressed, compressed_len)) -} - -pub async fn extract_tar_zst( - pgdata_path: &Utf8Path, - tar_zst: impl AsyncBufRead + Unpin, -) -> Result<()> { - let tar = Box::pin(ZstdDecoder::new(tar_zst)); - let mut archive = Archive::new(tar); - archive.unpack(pgdata_path).await?; - Ok(()) -} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 02a690d4e1..f947a75f61 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,6 +31,7 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use tenant::mgr::TenantManager; use tracing::info; /// Current storage format version @@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] -pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { +pub async fn shutdown_pageserver( + tenant_manager: &TenantManager, + deletion_queue: Option, + exit_code: i32, +) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. timed( - tenant::mgr::shutdown_all_tenants(), + tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5), ) @@ -114,27 +119,27 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_CONFIG_NAME: &str = "config"; +pub(crate) const TENANT_CONFIG_NAME: &str = "config"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local /// tenant path while in secondary mode. -pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; +pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. -pub const TEMP_FILE_SUFFIX: &str = "___temp"; +pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; /// A marker file to mark that a timeline directory was not fully initialized. /// If a timeline directory with this marker is encountered at pageserver startup, /// the timeline directory and the marker file are both removed. /// Full path: `tenants//timelines/___uninit`. -pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; -pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; +pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding @@ -161,11 +166,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once // from the name. 
-pub fn is_uninit_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) } -pub fn is_delete_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 814b3e1f96..075bb76a1b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -167,7 +167,7 @@ impl GetVectoredLatency { pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored", + "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -2465,7 +2465,8 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use metrics::UIntGauge; + use metrics::{register_int_counter, UIntGauge}; + use once_cell::sync::Lazy; pub struct Collector { descs: Vec, @@ -2473,15 +2474,13 @@ pub mod tokio_epoll_uring { systems_destroyed: UIntGauge, } - const NMETRICS: usize = 2; - impl metrics::core::Collector for Collector { fn desc(&self) -> Vec<&metrics::core::Desc> { self.descs.iter().collect() } fn collect(&self) -> Vec { - let mut mfs = Vec::with_capacity(NMETRICS); + let mut mfs = Vec::with_capacity(Self::NMETRICS); let tokio_epoll_uring::metrics::Metrics { systems_created, systems_destroyed, @@ -2495,6 +2494,8 @@ pub mod tokio_epoll_uring { } impl Collector { + const NMETRICS: usize = 2; + #[allow(clippy::new_without_default)] pub fn new() -> Self { let mut descs = Vec::new(); @@ -2528,6 +2529,22 @@ pub mod tokio_epoll_uring { } } } + + pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count", + "Number of times where thread_local_system creation spanned multiple executor threads", + ) + .unwrap() + }); + + pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count", + "Number of times thread_local_system creation failed and was retried after back-off.", + ) + .unwrap() + }); } pub(crate) mod tenant_throttling { @@ -2656,6 +2673,8 @@ pub fn preinitialize_metrics() { &WALRECEIVER_BROKER_UPDATES, &WALRECEIVER_CANDIDATES_ADDED, &WALRECEIVER_CANDIDATES_REMOVED, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 275a72c0b0..69e163effa 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -50,8 +50,6 @@ use once_cell::sync::Lazy; use utils::id::TimelineId; -use crate::shutdown_pageserver; - // // There are four runtimes: // @@ -453,7 +451,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(None, 1).await; + std::process::exit(1); } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f0996328c0..7a6ddd6a4e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -43,6 +43,8 @@ use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::zstd::create_zst_tarball; +use utils::zstd::extract_zst_tarball; use self::config::AttachedLocationConfig; use 
self::config::AttachmentMode; @@ -55,8 +57,8 @@ use self::mgr::GetTenantError; use self::mgr::TenantsMap; use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; @@ -565,9 +567,8 @@ impl Tenant { // avoiding holding it across awaits let mut timelines_accessor = self.timelines.lock().unwrap(); match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { - // The uninit mark file acts as a lock that prevents another task from - // initializing the timeline at the same time. unreachable!( "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" ); @@ -1064,8 +1065,7 @@ impl Tenant { let entry_path = entry.path(); let purge = if crate::is_temporary(entry_path) - // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already - // covered by the check that the timeline must exist in remote storage. + // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) || is_uninit_mark(entry_path) || crate::is_delete_mark(entry_path) { @@ -1298,11 +1298,6 @@ impl Tenant { /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) /// and the timeline will fail to load at a restart. /// - /// That's why we add an uninit mark file, and wrap it together witht the Timeline - /// in-memory object into UninitializedTimeline. - /// Once the caller is done setting up the timeline, they should call - /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark. - /// /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) @@ -1318,7 +1313,9 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; + // Protect against concurrent attempts to use this TimelineId + let create_guard = self.create_timeline_create_guard(new_timeline_id)?; + let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1333,7 +1330,7 @@ impl Tenant { self.prepare_new_timeline( new_timeline_id, &new_metadata, - timeline_uninit_mark, + create_guard, initdb_lsn, None, ) @@ -1421,9 +1418,8 @@ impl Tenant { .map_err(|_| CreateTimelineError::ShuttingDown)?; // Get exclusive access to the timeline ID: this ensures that it does not already exist, - // and that no other creation attempts will be allowed in while we are working. The - // uninit_mark is a guard. - let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + // and that no other creation attempts will be allowed in while we are working. 
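The `TimelineCreateGuard` that replaces the on-disk uninit mark is a purely in-memory exclusion: creating a timeline registers its ID in `Tenant::timelines_creating`, and dropping the guard releases it. A minimal sketch of that guard shape, with hypothetical stand-ins for the real types in `timeline::uninit`:

```rust
use std::collections::HashSet;
use std::sync::Mutex;

type TimelineId = u128; // stand-in for the real ID type

pub struct CreateGuard<'a> {
    creating: &'a Mutex<HashSet<TimelineId>>,
    id: TimelineId,
}

impl<'a> CreateGuard<'a> {
    /// Fails if another creation of the same ID is already in flight.
    pub fn new(
        creating: &'a Mutex<HashSet<TimelineId>>,
        id: TimelineId,
    ) -> Result<Self, &'static str> {
        if creating.lock().unwrap().insert(id) {
            Ok(Self { creating, id })
        } else {
            Err("timeline creation already in progress")
        }
    }
}

impl Drop for CreateGuard<'_> {
    fn drop(&mut self) {
        // Success or failure, dropping the guard re-admits the ID; there is
        // no mark file left behind to clean up after a crash.
        self.creating.lock().unwrap().remove(&self.id);
    }
}
```

Unlike the uninit mark file, the exclusion is lost on process crash, which is fine here: per the TODO above, startup now trusts remote storage, not mark files, to decide which timelines really exist.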
+ let create_guard = match self.create_timeline_create_guard(new_timeline_id) { Ok(m) => m, Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot @@ -1466,6 +1462,8 @@ impl Tenant { } }; + pausable_failpoint!("timeline-creation-after-uninit"); + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = self @@ -1513,7 +1511,7 @@ impl Tenant { &ancestor_timeline, new_timeline_id, ancestor_start_lsn, - uninit_mark, + create_guard, ctx, ) .await? @@ -1523,7 +1521,7 @@ impl Tenant { new_timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await? @@ -2870,9 +2868,9 @@ impl Tenant { start_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap(); + let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx) + .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) @@ -2886,10 +2884,10 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx) + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx) .await } @@ -2898,7 +2896,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, _ctx: &RequestContext, ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; @@ -2982,7 +2980,7 @@ impl Tenant { .prepare_new_timeline( dst_id, &metadata, - timeline_uninit_mark, + timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), ) @@ -3014,12 +3012,12 @@ impl Tenant { load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap(); + let create_guard = self.create_timeline_create_guard(timeline_id).unwrap(); self.bootstrap_timeline( timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await @@ -3046,8 +3044,13 @@ impl Tenant { } } - let (pgdata_zstd, tar_zst_size) = - import_datadir::create_tar_zst(pgdata_path, &temp_path).await?; + let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?; + const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; + if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT { + warn!( + "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}." 
+ ); + } pausable_failpoint!("before-initdb-upload"); @@ -3083,7 +3086,7 @@ impl Tenant { timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> anyhow::Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -3095,13 +3098,14 @@ impl Tenant { TEMP_FILE_SUFFIX, ); - // an uninit mark was placed before, nothing else can access this timeline files - // current initdb was not run yet, so remove whatever was left from the previous runs + // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees + // we won't race with other creations or existent timelines with the same path. if pgdata_path.exists() { fs::remove_dir_all(&pgdata_path).with_context(|| { format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; } + // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it scopeguard::defer! { if let Err(e) = fs::remove_dir_all(&pgdata_path) { @@ -3146,7 +3150,7 @@ impl Tenant { let buf_read = BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst); - import_datadir::extract_tar_zst(&pgdata_path, buf_read) + extract_zst_tarball(&pgdata_path, buf_read) .await .context("extract initdb tar")?; } else { @@ -3178,7 +3182,7 @@ impl Tenant { .prepare_new_timeline( timeline_id, &new_metadata, - timeline_uninit_mark, + timeline_create_guard, pgdata_lsn, None, ) @@ -3250,13 +3254,12 @@ impl Tenant { /// /// An empty layer map is initialized, and new data and WAL can be imported starting /// at 'disk_consistent_lsn'. After any initial data has been imported, call - /// `finish_creation` to insert the Timeline into the timelines map and to remove the - /// uninit mark file. + /// `finish_creation` to insert the Timeline into the timelines map. async fn prepare_new_timeline<'a>( &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - uninit_mark: TimelineUninitMark<'a>, + create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { @@ -3279,9 +3282,12 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); - if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await { + if let Err(e) = self + .create_timeline_files(&create_guard.timeline_path) + .await + { error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); return Err(e); } @@ -3292,41 +3298,31 @@ impl Tenant { Ok(UninitializedTimeline::new( self, new_timeline_id, - Some((timeline_struct, uninit_mark)), + Some((timeline_struct, create_guard)), )) } async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; - fail::fail_point!("after-timeline-uninit-mark-creation", |_| { - anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + fail::fail_point!("after-timeline-dir-creation", |_| { + anyhow::bail!("failpoint after-timeline-dir-creation"); }); Ok(()) } - /// Attempts to create an uninit mark file for the timeline initialization. - /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists. 
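The renamed `after-timeline-dir-creation` failpoint follows the same `fail` crate shape used throughout this diff (compare the `api-503`/`api-500` and `shard-split-*` points): a closure that is inert until a test configures an action for that name. Roughly, and assuming the crate's stock `fail::cfg` API:

```rust
fn create_timeline_files(timeline_path: &std::path::Path) -> anyhow::Result<()> {
    std::fs::create_dir_all(timeline_path)?;
    // No-op in production builds; when the failpoint is armed with a
    // "return" action, the closure runs and this function errors out here.
    fail::fail_point!("after-timeline-dir-creation", |_| {
        anyhow::bail!("failpoint after-timeline-dir-creation")
    });
    Ok(())
}

#[cfg(test)]
fn arm_failpoint() {
    // Simulates a crash between directory creation and timeline registration.
    fail::cfg("after-timeline-dir-creation", "return").unwrap();
}
```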
- /// - /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init. - fn create_timeline_uninit_mark( + /// Get a guard that provides exclusive access to the timeline directory, preventing + /// concurrent attempts to create the same timeline. + fn create_timeline_create_guard( &self, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let tenant_shard_id = self.tenant_shard_id; - let uninit_mark_path = self - .conf - .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - let uninit_mark = TimelineUninitMark::new( - self, - timeline_id, - uninit_mark_path.clone(), - timeline_path.clone(), - )?; + let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. @@ -3342,23 +3338,7 @@ impl Tenant { ))); } - // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees - // that during process runtime, colliding creations will be caught in-memory without getting - // as far as failing to write a file. - fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&uninit_mark_path) - .context("Failed to create uninit mark file") - .and_then(|_| { - crashsafe::fsync_file_and_parent(&uninit_mark_path) - .context("Failed to fsync uninit mark file") - }) - .with_context(|| { - format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") - })?; - - Ok(uninit_mark) + Ok(create_guard) } /// Gathers inputs from all of the timelines to produce a sizing model input. @@ -5099,15 +5079,15 @@ mod tests { } #[tokio::test] - async fn test_uninit_mark_crash() -> anyhow::Result<()> { - let name = "test_uninit_mark_crash"; + async fn test_create_guard_crash() -> anyhow::Result<()> { + let name = "test_create_guard_crash"; let harness = TenantHarness::create(name)?; { let (tenant, ctx) = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - // Keeps uninit mark in place + // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() @@ -5135,11 +5115,6 @@ mod tests { .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); - assert!(!harness - .conf - .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) - .exists()); - Ok(()) } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index ffb7206b1e..cab60c3111 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -296,6 +296,7 @@ impl DeleteTenantFlow { remote_storage: Option, tenants: &'static std::sync::RwLock, tenant: Arc, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { span::debug_assert_current_span_has_tenant_id(); @@ -303,7 +304,9 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + if let Err(e) = + Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await + { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -322,6 +325,7 @@ impl DeleteTenantFlow { conf: &'static PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant: &Tenant, + cancel: &CancellationToken, ) -> Result<(), 
DeleteTenantError> { guard.mark_in_progress()?; @@ -335,15 +339,9 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark( - conf, - remote_storage, - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), - ) - .await - .context("remote_mark")? + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -546,8 +544,7 @@ impl DeleteTenantFlow { conf, remote_storage.as_ref(), &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), + &task_mgr::shutdown_token(), ) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 26fcce1f38..f456ca3006 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -102,7 +102,7 @@ pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`tenant_map_acquire_slot`]. Open(BTreeMap), - /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. ShuttingDown(BTreeMap), } @@ -261,6 +261,12 @@ pub struct TenantManager { // See https://github.com/neondatabase/neon/issues/5796 tenants: &'static std::sync::RwLock, resources: TenantSharedResources, + + // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. + // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime), + // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or + // when the tenant detaches. + cancel: CancellationToken, } fn emergency_generations( @@ -620,13 +626,14 @@ pub async fn init_tenant_mgr( conf, tenants: &TENANTS, resources, + cancel: CancellationToken::new(), }) } /// Wrapper for Tenant::spawn that checks invariants before running, and inserts /// a broken tenant in the map if Tenant::spawn fails. #[allow(clippy::too_many_arguments)] -pub(crate) fn tenant_spawn( +fn tenant_spawn( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, @@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn( Ok(tenant) } -/// -/// Shut down all tenants. This runs as part of pageserver shutdown. -/// -/// NB: We leave the tenants in the map, so that they remain accessible through -/// the management API until we shut it down. If we removed the shut-down tenants -/// from the tenants map, the management API would return 404 for these tenants, -/// because TenantsMap::get() now returns `None`. -/// That could be easily misinterpreted by control plane, the consumer of the -/// management API. For example, it could attach the tenant on a different pageserver. -/// We would then be in split-brain once this pageserver restarts. 
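Threading `cancel: &CancellationToken` into `run_inner` fixes a subtle problem at both call sites: a freshly minted `CancellationToken::new()` has no owner that could ever cancel it, so the remote operations behind it were effectively uncancellable during shutdown. The behavioral difference, assuming `tokio_util`'s token:

```rust
use tokio_util::sync::CancellationToken;

async fn remote_op_with_cancel(cancel: &CancellationToken) -> Result<(), &'static str> {
    tokio::select! {
        // With a throwaway CancellationToken::new() this arm can never fire,
        // so shutdown has to wait out the whole remote operation.
        _ = cancel.cancelled() => Err("cancelled"),
        _ = do_remote_io() => Ok(()),
    }
}

async fn do_remote_io() { /* elided */ }
```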
-#[instrument(skip_all)] -pub(crate) async fn shutdown_all_tenants() { - shutdown_all_tenants0(&TENANTS).await -} - async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); @@ -833,40 +825,6 @@ pub(crate) enum SetNewTenantConfigError { Other(anyhow::Error), } -pub(crate) async fn set_new_tenant_config( - conf: &'static PageServerConf, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, -) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; - - if !tenant.tenant_shard_id().shard_count.is_unsharded() { - // Note that we use ShardParameters::default below. - return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( - "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" - ))); - } - - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single( - new_tenant_conf.clone(), - tenant.generation, - &ShardParameters::default(), - ); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) -} - #[derive(thiserror::Error, Debug)] pub(crate) enum UpsertLocationError { #[error("Bad config request: {0}")] @@ -1428,6 +1386,7 @@ impl TenantManager { self.resources.remote_storage.clone(), &TENANTS, tenant, + &self.cancel, ) .await; @@ -1443,6 +1402,35 @@ impl TenantManager { new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, + ) -> anyhow::Result> { + let r = self + .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx) + .await; + if r.is_err() { + // Shard splitting might have left the original shard in a partially shut down state (it + // stops the shard's remote timeline client). Reset it to ensure we leave things in + // a working state. + if self.get(tenant_shard_id).is_some() { + tracing::warn!("Resetting {tenant_shard_id} after shard split failure"); + if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { + // Log this error because our return value will still be the original error, not this one. This is + // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional + // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or + // setting it broken probably won't help either. 
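In the shard-split change just below, `shard_split` becomes a thin wrapper: it runs `do_shard_split`, and on failure attempts a best-effort `reset_tenant` while still surfacing the original error. Reduced to its control-flow shape:

```rust
async fn with_reset_on_failure<T, E: std::fmt::Display>(
    op: impl std::future::Future<Output = Result<T, E>>,
    reset: impl std::future::Future<Output = Result<(), E>>,
) -> Result<T, E> {
    let r = op.await;
    if r.is_err() {
        // Best effort: log the cleanup failure, but return the original
        // error so the caller learns why the operation itself failed.
        if let Err(e) = reset.await {
            tracing::error!("reset after failed operation also failed: {e}");
        }
    }
    r
}
```

The failpoints sprinkled through `do_shard_split` (`shard-split-pre-prepare`, `-post-prepare`, `-pre-hardlink`, and so on) exist precisely so tests can exercise this wrapper at each phase boundary.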
+ tracing::error!("Failed to reset {tenant_shard_id}: {e}"); + } + } + } + + r + } + + pub(crate) async fn do_shard_split( + &self, + tenant_shard_id: TenantShardId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, ) -> anyhow::Result> { let tenant = get_tenant(tenant_shard_id, true)?; @@ -1477,6 +1465,10 @@ impl TenantManager { .join(",") ); + fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + let parent_shard_identity = tenant.shard_identity; let parent_tenant_conf = tenant.get_tenant_conf(); let parent_generation = tenant.generation; @@ -1490,6 +1482,10 @@ impl TenantManager { return Err(e); } + fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + self.resources.deletion_queue_client.flush_advisory(); // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant @@ -1511,11 +1507,16 @@ impl TenantManager { anyhow::bail!("Detached parent shard in the middle of split!") } }; - + fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Optimization: hardlink layers from the parent into the children, so that they don't have to // re-download & duplicate the data referenced in their initial IndexPart self.shard_split_hardlink(parent, child_shards.clone()) .await?; + fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Take a snapshot of where the parent's WAL ingest had got to: we will wait for // child shards to reach this point. @@ -1555,6 +1556,10 @@ impl TenantManager { .await?; } + fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + // Phase 4: wait for child chards WAL ingest to catch up to target LSN for child_shard_id in &child_shards { let child_shard_id = *child_shard_id; @@ -1587,6 +1592,10 @@ impl TenantManager { timeline.timeline_id, target_lsn ); + + fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( + "failpoint" + ))); if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. @@ -1618,19 +1627,11 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - None, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); + self.spawn_background_purge(tmp_path); + + fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( + "failpoint" + ))); parent_slot_guard.drop_old_value()?; @@ -1763,6 +1764,151 @@ impl TenantManager { Ok(()) } + + /// + /// Shut down all tenants. This runs as part of pageserver shutdown. + /// + /// NB: We leave the tenants in the map, so that they remain accessible through + /// the management API until we shut it down. If we removed the shut-down tenants + /// from the tenants map, the management API would return 404 for these tenants, + /// because TenantsMap::get() now returns `None`. + /// That could be easily misinterpreted by control plane, the consumer of the + /// management API. For example, it could attach the tenant on a different pageserver. 
+ /// We would then be in split-brain once this pageserver restarts. + #[instrument(skip_all)] + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + shutdown_all_tenants0(self.tenants).await + } + + /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in + /// the background, and thereby avoid blocking any API requests on this deletion completing. + fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { + // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. + let task_tenant_id = None; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + task_tenant_id, + None, + "tenant_files_delete", + false, + async move { + fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) + }, + ); + } + + pub(crate) async fn detach_tenant( + &self, + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result<(), TenantStateError> { + let tmp_path = self + .detach_tenant0( + conf, + &TENANTS, + tenant_shard_id, + detach_ignored, + deletion_queue_client, + ) + .await?; + self.spawn_background_purge(tmp_path); + + Ok(()) + } + + async fn detach_tenant0( + &self, + conf: &'static PageServerConf, + tenants: &std::sync::RwLock, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result { + let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); + safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + }) + }; + + let removal_result = remove_tenant_from_memory( + tenants, + tenant_shard_id, + tenant_dir_rename_operation(tenant_shard_id), + ) + .await; + + // Flush pending deletions, so that they have a good chance of passing validation + // before this tenant is potentially re-attached elsewhere. + deletion_queue_client.flush_advisory(); + + // Ignored tenants are not present in memory and will bail the removal from memory operation. + // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. + if detach_ignored + && matches!( + removal_result, + Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) + ) + { + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); + if tenant_ignore_mark.exists() { + info!("Detaching an ignored tenant"); + let tmp_path = tenant_dir_rename_operation(tenant_shard_id) + .await + .with_context(|| { + format!("Ignored tenant {tenant_shard_id} local directory rename") + })?; + return Ok(tmp_path); + } + } + + removal_result + } + + pub(crate) async fn set_new_tenant_config( + &self, + new_tenant_conf: TenantConfOpt, + tenant_id: TenantId, + ) -> Result<(), SetNewTenantConfigError> { + // Legacy API: does not support sharding + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + info!("configuring tenant {tenant_id}"); + let tenant = get_tenant(tenant_shard_id, true)?; + + if !tenant.tenant_shard_id().shard_count.is_unsharded() { + // Note that we use ShardParameters::default below. 
+ return Err(SetNewTenantConfigError::Other(anyhow::anyhow!( + "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants" + ))); + } + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.generation, + &ShardParameters::default(), + ); + + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf) + .await + .map_err(SetNewTenantConfigError::Persist)?; + tenant.set_new_tenant_config(new_tenant_conf); + Ok(()) + } } #[derive(Debug, thiserror::Error)] @@ -1964,87 +2110,6 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn detach_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result<(), TenantStateError> { - let tmp_path = detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) - .await?; - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. - let task_tenant_id = None; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - Ok(()) -} - -async fn detach_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result { - let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) - }; - - let removal_result = remove_tenant_from_memory( - tenants, - tenant_shard_id, - tenant_dir_rename_operation(tenant_shard_id), - ) - .await; - - // Flush pending deletions, so that they have a good chance of passing validation - // before this tenant is potentially re-attached elsewhere. - deletion_queue_client.flush_advisory(); - - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. 
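Worth calling out in `detach_tenant0` (and in the shard-split cleanup earlier): deletion is split into a fast synchronous rename that frees the canonical path immediately, plus a lazy background `remove_dir_all` via `spawn_background_purge`. A condensed sketch of the two phases, with hypothetical paths and plain `tokio::spawn` standing in for `task_mgr::spawn`:

```rust
use std::path::{Path, PathBuf};

async fn detach_dir(tenant_dir: &Path) -> std::io::Result<()> {
    // Phase 1: rename to a temporary name. This is quick, and the canonical
    // path is free for re-attachment as soon as it returns.
    let tmp: PathBuf = tenant_dir.with_extension("___temp");
    tokio::fs::rename(tenant_dir, &tmp).await?;

    // Phase 2: delete the renamed tree lazily, off the request path.
    tokio::spawn(async move {
        if let Err(e) = tokio::fs::remove_dir_all(&tmp).await {
            tracing::warn!("background purge of {tmp:?} failed: {e}");
        }
    });
    Ok(())
}
```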
- if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - - removal_result -} - pub(crate) async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, @@ -2665,7 +2730,7 @@ use { utils::http::error::ApiError, }; -pub(crate) async fn immediate_gc( +pub(crate) fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, @@ -2687,6 +2752,8 @@ pub(crate) async fn immediate_gc( // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -2701,16 +2768,15 @@ pub(crate) async fn immediate_gc( #[allow(unused_mut)] let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. #[cfg(feature = "testing")] { + // we need to synchronize with drop completion for python tests without polling for + // log messages if let Ok(result) = result.as_mut() { - // why not futures unordered? it seems it needs very much the same task structure - // but would only run on single task. 
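The `immediate_gc` change also moves the `manual_gc` span from a single inner `.instrument(...)` on `gc_iteration` to the entire spawned future, so the testing-only JoinSet that waits for layer drops logs under the same tenant/timeline fields. The minimal form of that pattern:

```rust
use tracing::{info_span, Instrument};

fn spawn_instrumented() {
    let tenant_id = "hypothetical-tenant";
    let span = info_span!("manual_gc", %tenant_id);
    tokio::spawn(
        async move {
            // Every event in this block inherits the span's fields,
            // not just the single await that used to be instrumented.
            tracing::info!("gc iteration starting");
        }
        .instrument(span),
    );
}
```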
let mut js = tokio::task::JoinSet::new(); for layer in std::mem::take(&mut result.doomed_layers) { js.spawn(layer.wait_drop()); @@ -2726,7 +2792,7 @@ pub(crate) async fn immediate_gc( if let Some(rtc) = rtc { // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care just exit fast about the shutdown error + // deletions; don't care about the shutdown error, just exit fast drop(rtc.wait_completion().await); } } @@ -2737,6 +2803,7 @@ pub(crate) async fn immediate_gc( } Ok(()) } + .instrument(span) ); // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6fff6e78e2..6ee8ad7155 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName; use crate::tenant::Generation; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::TimelineId; @@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>( // If pageserver crashes the temp file will be deleted on startup and re-downloaded. let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - let (mut destination_file, bytes_amount) = download_retry( - || async { - let destination_file = tokio::fs::File::create(&temp_file_path) - .await - .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) - .map_err(DownloadError::Other)?; - - let download = storage.download(&remote_path, cancel).await?; - - let mut destination_file = - tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); - - let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - - let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await; - - match bytes_amount { - Ok(bytes_amount) => { - let destination_file = destination_file.into_inner(); - Ok((destination_file, bytes_amount)) - } - Err(e) => { - if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { - on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); - } - - Err(e.into()) - } - } - }, + let bytes_amount = download_retry( + || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, &format!("download {remote_path:?}"), cancel, ) .await?; - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. 
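After the refactor below, `download_layer_file` keeps only the retry policy and delegates one attempt's worth of work to `download_object`; because the closure recreates the temp file from scratch, every retry starts clean. A generic sketch of a retry combinator of that shape (the project's actual `download_retry` signature is assumed, not quoted):

```rust
use std::future::Future;

async fn retry<T, E, F, Fut>(mut op: F, attempts: usize) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    assert!(attempts > 0);
    let mut last_err = None;
    for _ in 0..attempts {
        match op().await {
            Ok(v) => return Ok(v),
            // The next call to `op()` rebuilds all per-attempt state
            // (temp file, stream), so partial writes never leak across tries.
            Err(e) => last_err = Some(e),
        }
    }
    Err(last_err.unwrap())
}
```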
- destination_file - .flush() - .await - .with_context(|| format!("flush source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - let expected = layer_metadata.file_size(); if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( @@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>( ))); } - // not using sync_data because it can lose file size update - destination_file - .sync_all() - .await - .with_context(|| format!("failed to fsync source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - drop(destination_file); - fail::fail_point!("remote-storage-download-pre-rename", |_| { Err(DownloadError::Other(anyhow!( "remote-storage-download-pre-rename failpoint triggered" @@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>( Ok(bytes_amount) } +/// Download the object `src_path` in the remote `storage` to local path `dst_path`. +/// +/// If Ok() is returned, the download succeeded and the inode & data have been made durable. +/// (Note that the directory entry for the inode is not made durable.) +/// The file size in bytes is returned. +/// +/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. +/// The unlinking has _not_ been made durable. +async fn download_object<'a>( + storage: &'a GenericRemoteStorage, + src_path: &RemotePath, + dst_path: &Utf8PathBuf, + cancel: &CancellationToken, +) -> Result { + let res = match crate::virtual_file::io_engine::get() { + crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), + crate::virtual_file::io_engine::IoEngine::StdFs => { + async { + let destination_file = tokio::fs::File::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let download = storage.download(src_path, cancel).await?; + + let mut buf_writer = + tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); + + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?; + buf_writer.flush().await?; + + let mut destination_file = buf_writer.into_inner(); + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because + // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. 
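The `StdFs` branch keeps the long-standing durability choreography intact: copy through a `BufWriter`, explicitly `flush`, then `sync_all` on the file before the caller renames the temp file into place. Boiled down to the essential sequence:

```rust
use tokio::io::{AsyncWriteExt, BufWriter};

async fn write_durably(file: tokio::fs::File, data: &[u8]) -> std::io::Result<()> {
    let mut writer = BufWriter::new(file);
    writer.write_all(data).await?;
    // Flush pushes buffered bytes down into the File; dropping a BufWriter
    // without flushing can silently lose the tail of the data.
    writer.flush().await?;
    let file = writer.into_inner();
    // sync_all rather than sync_data, so the file-size update is durable too.
    file.sync_all().await?;
    Ok(())
}
```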
+ destination_file + .flush() + .await + .with_context(|| format!("flush source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + #[cfg(target_os = "linux")] + crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { + use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + async { + let destination_file = VirtualFile::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let mut download = storage.download(src_path, cancel).await?; + + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. + // There's chunks_vectored() on the stream. + let (bytes_amount, destination_file) = async { + let size_tracking = size_tracking_writer::Writer::new(destination_file); + let mut buffered = owned_buffers_io::write::BufferedWriter::< + { super::BUFFER_SIZE }, + _, + >::new(size_tracking); + while let Some(res) = + futures::StreamExt::next(&mut download.download_stream).await + { + let chunk = match res { + Ok(chunk) => chunk, + Err(e) => return Err(e), + }; + buffered + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .await?; + } + let size_tracking = buffered.flush_and_into_inner().await?; + Ok(size_tracking.into_inner()) + } + .await?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + }; + + // in case the download failed, clean up + match res { + Ok(bytes_amount) => Ok(bytes_amount), + Err(e) => { + if let Err(e) = tokio::fs::remove_file(dst_path).await { + if e.kind() != std::io::ErrorKind::NotFound { + on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}")); + } + } + Err(e) + } + } +} + const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 14e88b836e..19f36c722e 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant { shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, + // Internal state used by the Downloader. detail: std::sync::Mutex, + + // Public state indicating overall progress of downloads relative to the last heatmap seen + pub(crate) progress: std::sync::Mutex, } impl SecondaryTenant { @@ -118,6 +122,8 @@ impl SecondaryTenant { tenant_conf: std::sync::Mutex::new(tenant_conf), detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + + progress: std::sync::Mutex::default(), }) } @@ -247,9 +253,12 @@ impl SecondaryTenant { } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, -/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, -/// where we want to immediately upload/download for a particular tenant. In normal operation -/// uploads & downloads are autonomous and not driven by this interface. +/// and heatmap uploads. 
This is not a hot data path: it's used for: +/// - Live migrations, where we want to ensure a migration destination has the freshest possible +/// content before trying to cut over. +/// - Tests, where we want to immediately upload/download for a particular tenant. +/// +/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, download_req_tx: tokio::sync::mpsc::Sender>, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index b679077358..a595096133 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -41,14 +41,16 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use rand::Rng; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, + id::TimelineId, }; use super::{ @@ -128,6 +130,7 @@ pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, last_download: Option, + last_etag: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -138,11 +141,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat> { datetime.format("%d/%m/%Y %T") } +/// Information returned from download function when it detects the heatmap has changed +struct HeatMapModified { + etag: Etag, + last_modified: SystemTime, + bytes: Vec, +} + +enum HeatMapDownload { + // The heatmap's etag has changed: return the new etag, mtime and the body bytes + Modified(HeatMapModified), + // The heatmap's etag is unchanged + Unmodified, +} + impl SecondaryDetail { pub(super) fn new(config: SecondaryLocationConfig) -> Self { Self { config, last_download: None, + last_etag: None, next_download: None, timelines: HashMap::new(), } @@ -477,11 +495,31 @@ impl<'a> TenantDownloader<'a> { }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + + // We will use the etag from last successful download to make the download conditional on changes + let last_etag = self + .secondary_state + .detail + .lock() + .unwrap() + .last_etag + .clone(); + // Download the tenant's heatmap - let heatmap_bytes = tokio::select!( - bytes = self.download_heatmap() => {bytes?}, + let HeatMapModified { + last_modified: heatmap_mtime, + etag: heatmap_etag, + bytes: heatmap_bytes, + } = match tokio::select!( + bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) - ); + ) { + HeatMapDownload::Unmodified => { + tracing::info!("Heatmap unchanged since last successful download"); + return Ok(()); + } + HeatMapDownload::Modified(m) => m, + }; let heatmap = serde_json::from_slice::(&heatmap_bytes)?; @@ -498,6 +536,14 @@ impl<'a> TenantDownloader<'a> { tracing::debug!("Wrote local heatmap to {}", heatmap_path); + // Clean up any local layers that aren't in the heatmap. 
We do this first for all timelines, on the general + // principle that deletions should be done before writes wherever possible, and so that we can use this + // phase to initialize our SecondaryProgress. + { + *self.secondary_state.progress.lock().unwrap() = + self.prepare_timelines(&heatmap, heatmap_mtime).await?; + } + // Download the layers in the heatmap for timeline in heatmap.timelines { if self.secondary_state.cancel.is_cancelled() { @@ -515,30 +561,159 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Only update last_etag after a full successful download: this way will not skip + // the next download, even if the heatmap's actual etag is unchanged. + self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag); + Ok(()) } - async fn download_heatmap(&self) -> Result, UpdateError> { + /// Do any fast local cleanup that comes before the much slower process of downloading + /// layers from remote storage. In the process, initialize the SecondaryProgress object + /// that will later be updated incrementally as we download layers. + async fn prepare_timelines( + &self, + heatmap: &HeatMapTenant, + heatmap_mtime: SystemTime, + ) -> Result { + let heatmap_stats = heatmap.get_stats(); + // We will construct a progress object, and then populate its initial "downloaded" numbers + // while iterating through local layer state in [`Self::prepare_timelines`] + let mut progress = SecondaryProgress { + layers_total: heatmap_stats.layers, + bytes_total: heatmap_stats.bytes, + heatmap_mtime: Some(heatmap_mtime), + layers_downloaded: 0, + bytes_downloaded: 0, + }; + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock + let mut delete_layers = Vec::new(); + let mut delete_timelines = Vec::new(); + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + for (timeline_id, timeline_state) in &mut detail.timelines { + let Some(heatmap_timeline_index) = heatmap + .timelines + .iter() + .position(|t| t.timeline_id == *timeline_id) + else { + // This timeline is no longer referenced in the heatmap: delete it locally + delete_timelines.push(*timeline_id); + continue; + }; + + let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); + + let layers_in_heatmap = heatmap_timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + let mut layer_count = layers_on_disk.len(); + let mut layer_byte_count: u64 = timeline_state + .on_disk_layers + .values() + .map(|l| l.metadata.file_size()) + .sum(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + layer_count -= 1; + layer_byte_count -= timeline_state + .on_disk_layers + .get(layer) + .unwrap() + .metadata + .file_size(); + + delete_layers.push((*timeline_id, (*layer).clone())); + } + + progress.bytes_downloaded += layer_byte_count; + progress.layers_downloaded += layer_count; + } + } + + // Execute accumulated deletions + for (timeline_id, layer_name) in delete_layers { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + let local_path = timeline_path.join(layer_name.to_string()); + tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing 
secondary layer")?; + + // Update in-memory housekeeping to reflect the absence of the deleted layer + let mut detail = self.secondary_state.detail.lock().unwrap(); + let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { + continue; + }; + timeline_state.on_disk_layers.remove(&layer_name); + } + + for timeline_id in delete_timelines { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + tracing::info!(timeline_id=%timeline_id, + "Timeline no longer in heatmap, removing from secondary location" + ); + tokio::fs::remove_dir_all(&timeline_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary timeline")?; + } + + Ok(progress) + } + + /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object + /// still matches `prev_etag`. + async fn download_heatmap( + &self, + prev_etag: Option<&Etag>, + ) -> Result { debug_assert_current_span_has_tenant_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - // TODO: make download conditional on ETag having changed since last download + // TODO: pull up etag check into the request, to do a conditional GET rather than + // issuing a GET and then maybe ignoring the response body // (https://github.com/neondatabase/neon/issues/6199) tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); let cancel = &self.secondary_state.cancel; - let heatmap_bytes = backoff::retry( + backoff::retry( || async { let download = self .remote_storage .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; - let mut heatmap_bytes = Vec::new(); - let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - Ok(heatmap_bytes) + + if Some(&download.etag) == prev_etag { + Ok(HeatMapDownload::Unmodified) + } else { + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; + SECONDARY_MODE.download_heatmap.inc(); + Ok(HeatMapDownload::Modified(HeatMapModified { + etag: download.etag, + last_modified: download.last_modified, + bytes: heatmap_bytes, + })) + } }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, @@ -548,11 +723,7 @@ impl<'a> TenantDownloader<'a> { ) .await .ok_or_else(|| UpdateError::Cancelled) - .and_then(|x| x)?; - - SECONDARY_MODE.download_heatmap.inc(); - - Ok(heatmap_bytes) + .and_then(|x| x) } async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { @@ -593,27 +764,6 @@ impl<'a> TenantDownloader<'a> { } }; - let layers_in_heatmap = timeline - .layers - .iter() - .map(|l| &l.name) - .collect::>(); - let layers_on_disk = timeline_state - .on_disk_layers - .iter() - .map(|l| l.0) - .collect::>(); - - // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { - let local_path = timeline_path.join(layer.to_string()); - tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found) - .maybe_fatal_err("Removing secondary layer")?; - } - // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { @@ -662,6 +812,12 @@ impl<'a> TenantDownloader<'a> { } } + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally let downloaded_bytes = match download_layer_file( self.conf, @@ -701,6 +857,11 @@ impl<'a> TenantDownloader<'a> { tokio::fs::remove_file(&local_path) .await .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; } SECONDARY_MODE.download_layer.inc(); diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 99aaaeb8c8..73cdf6c6d4 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -62,3 +62,25 @@ impl HeatMapTimeline { } } } + +pub(crate) struct HeatMapStats { + pub(crate) bytes: u64, + pub(crate) layers: usize, +} + +impl HeatMapTenant { + pub(crate) fn get_stats(&self) -> HeatMapStats { + let mut stats = HeatMapStats { + bytes: 0, + layers: 0, + }; + for timeline in &self.timelines { + for layer in &timeline.layers { + stats.layers += 1; + stats.bytes += layer.metadata.file_size; + } + } + + stats + } +} diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e0b1652d98..ad79b74d8b 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -183,7 +183,13 @@ pub(super) async fn gather_inputs( // new gc run, which we have no control over. however differently from `Timeline::gc` // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. - let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + // + // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather + // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // PITR interval, they will see synthetic size decrease, even if we are still storing data inside + // horizon_cutoff. + let mut next_gc_cutoff = gc_info.pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
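The `HeatMapTenant::get_stats` helper added a few hunks above aggregates with nested `for` loops; an equivalent iterator-style sketch, with stub types mirroring the fields used there, shows the same computation as a single fold:

```rust
struct HeatMapLayer {
    file_size: u64, // stand-in: the real layer reads metadata.file_size
}
struct HeatMapTimeline {
    layers: Vec<HeatMapLayer>,
}
struct HeatMapTenant {
    timelines: Vec<HeatMapTimeline>,
}
struct HeatMapStats {
    bytes: u64,
    layers: usize,
}

impl HeatMapTenant {
    fn get_stats(&self) -> HeatMapStats {
        self.timelines
            .iter()
            .flat_map(|t| t.layers.iter())
            .fold(HeatMapStats { bytes: 0, layers: 0 }, |mut stats, layer| {
                // Each heatmap layer contributes one count and its size.
                stats.layers += 1;
                stats.bytes += layer.file_size;
                stats
            })
    }
}
```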
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 299950cc21..5c3bab9868 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -20,6 +20,7 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::models::{ LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use std::borrow::Cow; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; @@ -427,7 +428,7 @@ impl LayerAccessStatFullDetails { } = self; pageserver_api::models::LayerAccessStatFullDetails { when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: task_kind.into(), // into static str, powered by strum_macros + task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros access_kind: *access_kind, } } @@ -525,7 +526,7 @@ impl LayerAccessStats { .collect(), task_kind_access_flag: task_kind_flag .iter() - .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros + .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros .collect(), first: first_access.as_ref().map(|a| a.as_api_model()), accesses_history: last_accesses.map(|m| m.as_api_model()), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 959065bc4c..f37d7e6449 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -710,10 +710,6 @@ impl LayerInner { // disable any scheduled but not yet running eviction deletions for this let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete drop(self.status.send(Status::Downloaded)); @@ -722,7 +718,9 @@ impl LayerInner { .upgrade() .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let can_ever_evict = timeline.remote_client.as_ref().is_some(); @@ -731,9 +729,17 @@ impl LayerInner { let needs_download = self .needs_download() .await - .map_err(DownloadError::PreStatFailed)?; + .map_err(DownloadError::PreStatFailed); - let permit = if let Some(reason) = needs_download { + let needs_download = match needs_download { + Ok(reason) => reason, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + let (permit, downloaded) = if let Some(reason) = needs_download { if let NeedsDownload::NotFile(ft) = reason { return Err(DownloadError::NotFile(ft)); } @@ -744,36 +750,59 @@ impl LayerInner { self.wanted_evicted.store(false, Ordering::Release); if !can_ever_evict { + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::NoRemoteStorage); } if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; + let res = self.check_expected_download(ctx); + if let Err(e) = res { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } } if !allow_download { // this does look weird, but for LayerInner the "downloading" 
means also changing // internal once related state ... + scopeguard::ScopeGuard::into_inner(init_cancelled); return Err(DownloadError::DownloadRequired); } tracing::info!(%reason, "downloading on-demand"); - self.spawn_download_and_wait(timeline, permit).await? + let permit = self.spawn_download_and_wait(timeline, permit).await; + + let permit = match permit { + Ok(permit) => permit, + Err(e) => { + scopeguard::ScopeGuard::into_inner(init_cancelled); + return Err(e); + } + }; + + (permit, true) } else { // the file is present locally, probably by a previous but cancelled call to // get_or_maybe_download. alternatively we might be running without remote storage. LAYER_IMPL_METRICS.inc_init_needed_no_download(); - permit + (permit, false) }; - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + scopeguard::ScopeGuard::into_inner(init_cancelled); + + if downloaded { + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } } let res = Arc::new(DownloadedLayer { @@ -795,8 +824,6 @@ impl LayerInner { ); } - scopeguard::ScopeGuard::into_inner(init_cancelled); - Ok((ResidentOrWantedEvicted::Resident(res), permit)) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) @@ -1457,7 +1484,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. - #[tracing::instrument(skip_all, fields(layer=%self))] + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, @@ -1477,9 +1504,9 @@ impl ResidentLayer { // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await - .context("Layer index is corrupted") + .with_context(|| format!("Layer index is corrupted for {self}")) } - Image(_) => anyhow::bail!("cannot load_keys on a image layer"), + Image(_) => anyhow::bail!(format!("cannot load_keys on an image layer {self}")), } } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 280773e9c3..f3f3d5e3ae 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -130,10 +130,10 @@ where self.inner.load().config.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) { + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> { let inner = self.inner.load_full(); // clones the `Inner` Arc if !inner.task_kinds.contains(ctx.task_kind()) { - return; + return None; }; let start = std::time::Instant::now(); let mut did_throttle = false; @@ -170,6 +170,9 @@ where }); } } + Some(wait_time) + } else { + None } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a733a3b1a7..2ab7301cce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -634,6 +634,8 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// + /// This method enforces [`Self::timeline_get_throttle`] internally. + /// /// NOTE: It is considered an error to 'get' a key that doesn't exist.
The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a @@ -644,18 +646,27 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. + #[inline(always)] pub(crate) async fn get( &self, key: Key, lsn: Lsn, ctx: &RequestContext, + ) -> Result<Bytes, PageReconstructError> { + self.timeline_get_throttle.throttle(ctx, 1).await; + self.get_impl(key, lsn, ctx).await + } + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, ) -> Result<Bytes, PageReconstructError> { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } - self.timeline_get_throttle.throttle(ctx, 1).await; - // This check is debug-only because of the cost of hashing, and because it's a double-check: we // already checked the key against the shard_identity when looking up the Timeline from // page_service. @@ -752,10 +763,6 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } - self.timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - for range in &keyspace.ranges { let mut key = range.start; while key != range.end { @@ -772,11 +779,18 @@ impl Timeline { self.conf.get_vectored_impl ); - let _timer = crate::metrics::GET_VECTORED_LATENCY + let start = crate::metrics::GET_VECTORED_LATENCY .for_task_kind(ctx.task_kind()) - .map(|t| t.start_timer()); + .map(|metric| (metric, Instant::now())); - match self.conf.get_vectored_impl { + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + + let res = match self.conf.get_vectored_impl { + GetVectoredImpl::Sequential => { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } @@ -790,9 +804,33 @@ impl Timeline { vectored_res } + }; + + if let Some((metric, start)) = start { + let elapsed = start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + + if let Some(ex_throttled) = ex_throttled { + metric.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy<Mutex<RateLimit>> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } } + + res } + /// Not subject to [`Self::timeline_get_throttle`]. pub(super) async fn get_vectored_sequential_impl( &self, keyspace: KeySpace, @@ -803,7 +841,7 @@ impl Timeline { for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get(key, lsn, ctx).await; + let block = self.get_impl(key, lsn, ctx).await; use PageReconstructError::*; match block { @@ -853,6 +891,7 @@ impl Timeline { Ok(results) } + /// Not subject to [`Self::timeline_get_throttle`].
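The bookkeeping above is the core of the change: time spent waiting in `timeline_get_throttle` is deducted before the latency is observed, so `GET_VECTORED_LATENCY` measures only actual work. The same pattern in isolation (a sketch with illustrative names, not code from this PR):

    use std::time::{Duration, Instant};

    fn latency_ex_throttled(start: Instant, throttled: Option<Duration>) -> Option<Duration> {
        let elapsed = start.elapsed();
        match throttled {
            // checked_sub returns None if the clock readings are inconsistent,
            // i.e. the throttle wait appears longer than the total elapsed time.
            Some(waited) => elapsed.checked_sub(waited),
            None => Some(elapsed),
        }
    }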
pub(super) async fn validate_get_vectored_impl( &self, vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>, @@ -2967,7 +3006,6 @@ impl Timeline { } trace!("waking up"); - let timer = self.metrics.flush_time_histo.start_timer(); let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { if self.cancel.is_cancelled() { @@ -2978,6 +3016,8 @@ impl Timeline { return; } + let timer = self.metrics.flush_time_histo.start_timer(); + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2999,13 +3039,12 @@ impl Timeline { break err; } } + timer.stop_and_record(); }; // Notify any listeners that we're done let _ = self .layer_flush_done_tx .send_replace((flush_counter, result)); - - timer.stop_and_record(); } } @@ -3073,6 +3112,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result<(), FlushLayerError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -3744,8 +3784,11 @@ impl Timeline { // The timestamp is in the future. That sounds impossible, // but what it really means is that there hasn't been // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. debug!("future({})", lsn); - cutoff_horizon + self.get_last_record_lsn() } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 27d6fd9c28..e1034a9fe2 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; -use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; +use tracing::{error, info, info_span}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn}; use crate::{context::RequestContext, import_datadir, tenant::Tenant}; @@ -11,22 +11,22 @@ use super::Timeline; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or -/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory -/// to be removed on next restart. +/// its local files are removed. If we crash while this struct exists, then the timeline's local +/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage. /// /// The caller is responsible for proper timeline data filling before the final init.
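To make the new creation protocol concrete, the intended guard lifecycle is roughly as follows (a sketch assuming a `tenant: &Tenant` and a pre-computed `timeline_path`; only `TimelineCreateGuard::new` and `cleanup_timeline_directory` appear in this diff):

    // Registers timeline_id in Tenant::timelines_creating; a concurrent creation
    // attempt for the same ID fails with a TimelineExclusionError.
    let guard = TimelineCreateGuard::new(tenant, timeline_id, timeline_path)?;
    // ...write local files, upload the index to remote storage...
    // On success, finish_creation() inserts the Timeline into the tenant's map and
    // drops the guard; on failure or drop, cleanup_timeline_directory(guard) removes
    // the partial directory and releases the TimelineId for later attempts.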
#[must_use] pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>, ) -> Self { Self { owning_tenant, @@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> { } } - /// Finish timeline creation: insert it into the Tenant's timelines map and remove the - /// uninit mark file. + /// Finish timeline creation: insert it into the Tenant's timelines map /// /// This function launches the flush loop if not already done. /// @@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> { Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion - let (new_timeline, uninit_mark) = + let (new_timeline, _create_guard) = self.raw_timeline.take().expect("already checked"); - // this is the mutual exclusion between different retries to create the timeline; - // this should be an assertion. - uninit_mark.remove_uninit_mark().with_context(|| { - format!( - "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}" - ) - })?; v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); @@ -120,8 +112,7 @@ impl<'t> UninitializedTimeline<'t> { .await .context("Failed to flush after basebackup import")?; - // All the data has been imported. Insert the Timeline into the tenant's timelines - // map and remove the uninit mark file. + // All the data has been imported.
Insert the Timeline into the tenant's timelines map let tl = self.finish_creation()?; tl.activate(broker_client, None, ctx); Ok(tl) @@ -143,37 +134,35 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, uninit_mark)) = self.raw_timeline.take() { + if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); error!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); } } } -pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { - let timeline_path = &uninit_mark.timeline_path; +pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { + let timeline_path = &create_guard.timeline_path; match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { Ok(()) => { - info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + info!("Timeline dir {timeline_path:?} removed successfully") } Err(e) => { error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") } } - drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists + // Having cleaned up, we can release this TimelineId in [`Tenant::timelines_creating`] to allow other + // timeline creation attempts under this TimelineId to proceed + drop(create_guard); } -/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, -/// or gets removed eventually. -/// -/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +/// A guard for timeline creations in progress: as long as this object exists, the timeline ID +/// is kept in [`Tenant::timelines_creating`] to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineUninitMark<'t> { +pub(crate) struct TimelineCreateGuard<'t> { owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_deleted: bool, - uninit_mark_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf, } @@ -190,11 +179,10 @@ pub(crate) enum TimelineExclusionError { Other(#[from] anyhow::Error), } -impl<'t> TimelineUninitMark<'t> { +impl<'t> TimelineCreateGuard<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf, ) -> Result<Self, TimelineExclusionError> { // Lock order: this is the only place we take both locks.
During drop() we only @@ -214,56 +202,14 @@ impl<'t> TimelineUninitMark<'t> { Ok(Self { owning_tenant, timeline_id, - uninit_mark_deleted: false, - uninit_mark_path, timeline_path, }) } } - - fn remove_uninit_mark(mut self) -> anyhow::Result<()> { - if !self.uninit_mark_deleted { - self.delete_mark_file_if_present()?; - } - - Ok(()) - } - - fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { - let uninit_mark_file = &self.uninit_mark_path; - let uninit_mark_parent = uninit_mark_file - .parent() - .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { - format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") - })?; - crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; - self.uninit_mark_deleted = true; - - Ok(()) - } } -impl Drop for TimelineUninitMark<'_> { +impl Drop for TimelineCreateGuard<'_> { fn drop(&mut self) { - if !self.uninit_mark_deleted { - if self.timeline_path.exists() { - error!( - "Uninit mark {} is not removed, timeline {} stays uninitialized", - self.uninit_mark_path, self.timeline_path - ) - } else { - // unblock later timeline creation attempts - warn!( - "Removing intermediate uninit mark file {}", - self.uninit_mark_path - ); - if let Err(e) = self.delete_mark_file_if_present() { - error!("Failed to remove the uninit mark file: {e}") - } - } - } - self.owning_tenant .timelines_creating .lock() diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 8297ca6563..d9f780cfd1 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -448,6 +448,7 @@ pub(super) async fn handle_walreceiver_connection( disk_consistent_lsn, remote_consistent_lsn, replytime: ts, + shard_number: timeline.tenant_shard_id.shard_number.0 as u32, }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6d4774cf75..dee36d8afd 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -28,12 +28,31 @@ use tokio::time::Instant; pub use pageserver_api::models::virtual_file as api; pub(crate) mod io_engine; +pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +#[cfg_attr(not(target_os = "linux"), allow(dead_code))] +pub(crate) mod owned_buffers_io { + //! Abstractions for IO with owned buffers. + //! + //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary + //! reason we need this abstraction. + //! + //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`, + //! but for the time being we're proving out the primitives in the neon.git repo + //! for faster iteration. + + pub(crate) mod write; + pub(crate) mod util { + pub(crate) mod size_tracking_writer; + } +} + /// /// A virtual file descriptor. 
You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index e369d28711..7a27be2ca1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -6,6 +6,11 @@ //! Initialize using [`init`]. //! //! Then use [`get`] and [`super::OpenOptions`]. +//! +//! + +#[cfg(target_os = "linux")] +pub(super) mod tokio_epoll_uring_ext; use tokio_epoll_uring::{IoBuf, Slice}; use tracing::Instrument; @@ -145,7 +150,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.read(file_guard, offset, buf).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -160,7 +165,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.fsync(file_guard).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -178,7 +183,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.fdatasync(file_guard).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -197,7 +202,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.statx(file_guard).await; ( resources, @@ -220,7 +225,7 @@ impl IoEngine { } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = tokio_epoll_uring_ext::thread_local_system().await; let (resources, res) = system.write(file_guard, offset, buf).await; (resources, res.map_err(epoll_uring_error_to_std)) } @@ -253,3 +258,82 @@ impl IoEngine { } } } + +pub enum FeatureTestResult { + PlatformPreferred(IoEngineKind), + Worse { + engine: IoEngineKind, + remark: String, + }, +} + +impl FeatureTestResult { + #[cfg(target_os = "linux")] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring; + #[cfg(not(target_os = "linux"))] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs; +} + +impl From<FeatureTestResult> for IoEngineKind { + fn from(val: FeatureTestResult) -> Self { + match val { + FeatureTestResult::PlatformPreferred(e) => e, + FeatureTestResult::Worse { engine, .. } => engine, + } + } +} + +/// Somewhat costly under the hood, do only once. +/// Panics if we can't set up the feature test.
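A plausible startup use of `feature_test` and `FeatureTestResult` (a sketch; the logging is illustrative, only the names exported by this diff are real):

    let engine: IoEngineKind = match io_engine_feature_test()? {
        FeatureTestResult::PlatformPreferred(kind) => kind,
        FeatureTestResult::Worse { engine, remark } => {
            // Keep running, but make the degradation visible.
            tracing::warn!(%remark, "falling back to a non-preferred io engine");
            engine
        }
    };

Where the remark is irrelevant, the `From` impl above allows a plain `IoEngineKind::from(result)` instead.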
+pub fn feature_test() -> anyhow::Result<FeatureTestResult> { + std::thread::spawn(|| { + + #[cfg(not(target_os = "linux"))] + { + Ok(FeatureTestResult::PlatformPreferred( + FeatureTestResult::PLATFORM_PREFERRED, + )) + } + #[cfg(target_os = "linux")] + { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { + Ok(_) => FeatureTestResult::PlatformPreferred({ + assert!(matches!( + IoEngineKind::TokioEpollUring, + FeatureTestResult::PLATFORM_PREFERRED + )); + FeatureTestResult::PLATFORM_PREFERRED + }), + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { + let remark = match e.raw_os_error() { + Some(nix::libc::EPERM) => { + // fall back + "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " + .to_string() + } + Some(nix::libc::EFAULT) => { + // fail feature test + anyhow::bail!( + "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" + ); + } + Some(_) | None => { + // fall back + format!("creating tokio-epoll-uring fails with error: {e:#}") + } + }; + FeatureTestResult::Worse { + engine: IoEngineKind::StdFs, + remark, + } + } + }) + } + }) + .join() + .unwrap() +} diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs new file mode 100644 index 0000000000..6ea19d6b2d --- /dev/null +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -0,0 +1,194 @@ +//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific +//! handling in case the instance can't be launched. +//! +//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation +//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. +//! See for more details. + +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; + +use tokio_util::sync::CancellationToken; +use tracing::{error, info, info_span, warn, Instrument}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; + +use tokio_epoll_uring::{System, SystemHandle}; + +use crate::virtual_file::on_fatal_io_error; + +use crate::metrics::tokio_epoll_uring as metrics; + +#[derive(Clone)] +struct ThreadLocalState(Arc<ThreadLocalStateInner>); + +struct ThreadLocalStateInner { + cell: tokio::sync::OnceCell<SystemHandle>, + launch_attempts: AtomicU32, + /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] + thread_local_state_id: u64, +} + +impl ThreadLocalState { + pub fn new() -> Self { + Self(Arc::new(ThreadLocalStateInner { + cell: tokio::sync::OnceCell::default(), + launch_attempts: AtomicU32::new(0), + thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed), + })) + } + + pub fn make_id_string(&self) -> String { + format!("{}", self.0.thread_local_state_id) + } } +} + +static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0); + +thread_local! { + static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); +} + +/// Panics if we cannot [`System::launch`].
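Call sites never observe the transient ENOMEM failures handled below; they simply await the handle and use it like a `SystemHandle` via the `Deref` impl at the bottom of this file, e.g. (mirroring the io_engine.rs call sites in this diff):

    let system = tokio_epoll_uring_ext::thread_local_system().await;
    let (resources, res) = system.read(file_guard, offset, buf).await;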
+pub async fn thread_local_system() -> Handle { + let fake_cancel = CancellationToken::new(); + loop { + let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone()); + let inner = &thread_local_state.0; + let get_or_init_res = inner + .cell + .get_or_try_init(|| async { + let attempt_no = inner + .launch_attempts + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no); + async { + // Rate-limit retries per thread-local. + // NB: doesn't yield to executor at attempt_no=0. + utils::backoff::exponential_backoff( + attempt_no, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + &fake_cancel, + ) + .await; + let res = System::launch() + // this might move us to another executor thread => loop outside the get_or_try_init, not inside it + .await; + match res { + Ok(system) => { + info!("successfully launched system"); + metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc(); + Ok(system) + } + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { + warn!("not enough locked memory to launch tokio-epoll-uring, will retry"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); + Err(()) + } + // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. + // This is equivalent to a fatal IO error. + Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => { + error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring"); + }, + } + } + .instrument(span) + .await + }) + .await; + if get_or_init_res.is_ok() { + return Handle(thread_local_state); + } + } +} + +fn emit_launch_failure_process_stats() { + // tokio-epoll-uring stats + // vmlck + rlimit + // number of threads + // rss / system memory usage generally + + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + info!(systems_created, systems_destroyed, "tokio-epoll-uring"); + + match procfs::process::Process::myself() { + Ok(myself) => { + match myself.limits() { + Ok(limits) => { + info!(?limits.max_locked_memory, "/proc/self/limits"); + } + Err(error) => { + info!(%error, "no limit stats due to error"); + } + } + + match myself.status() { + Ok(status) => { + let procfs::process::Status { + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + ..
+ } = status; + info!( + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + "/proc/self/status" + ); + } + Err(error) => { + info!(%error, "no status stats due to error"); + } + } + } + Err(error) => { + info!(%error, "no process stats due to error"); + } + }; +} + +#[derive(Clone)] +pub struct Handle(ThreadLocalState); + +impl std::ops::Deref for Handle { + type Target = SystemHandle; + + fn deref(&self) -> &Self::Target { + self.0 + .0 + .cell + .get() + .expect("must be already initialized when using this") + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index f75edb0bac..7f951270d1 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -98,7 +98,7 @@ impl OpenOptions { OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), #[cfg(target_os = "linux")] OpenOptions::TokioEpollUring(x) => { - let system = tokio_epoll_uring::thread_local_system().await; + let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; system.open(path, x).await.map_err(|e| match e { tokio_epoll_uring::Error::Op(e) => e, tokio_epoll_uring::Error::System(system) => { diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs new file mode 100644 index 0000000000..7505b7487e --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -0,0 +1,34 @@ +use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use tokio_epoll_uring::{BoundedBuf, IoBuf}; + +pub struct Writer { + dst: VirtualFile, + bytes_amount: u64, +} + +impl Writer { + pub fn new(dst: VirtualFile) -> Self { + Self { + dst, + bytes_amount: 0, + } + } + /// Returns the wrapped `VirtualFile` object as well as the number + /// of bytes that were written to it through this object. + pub fn into_inner(self) -> (u64, VirtualFile) { + (self.bytes_amount, self.dst) + } +} + +impl OwnedAsyncWriter for Writer { + #[inline(always)] + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = self.dst.write_all(buf).await; + let nwritten = res?; + self.bytes_amount += u64::try_from(nwritten).unwrap(); + Ok((nwritten, buf)) + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs new file mode 100644 index 0000000000..f1812d9b51 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -0,0 +1,206 @@ +use bytes::BytesMut; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +/// A trait for doing owned-buffer write IO. +/// Think [`tokio::io::AsyncWrite`] but with owned buffers. +pub trait OwnedAsyncWriter { + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)>; +} + +/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes +/// into `BUFFER_SIZE`-sized writes. +/// +/// # Passthrough Of Large Writes +/// +/// Buffered writes larger than the `BUFFER_SIZE` cause the internal +/// buffer to be flushed, even if it is not full yet. Then, the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
+/// +/// This pass-through is generally beneficial for throughput, but if +/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, +/// unlimited large writes may cause latency or fairness issues. +/// +/// In such cases, a different implementation that always buffers in memory +/// may be preferable. +pub struct BufferedWriter<const BUFFER_SIZE: usize, W> { + writer: W, + // invariant: always remains Some(buf) + // with buf.capacity() == BUFFER_SIZE except + // - while IO is ongoing => goes back to Some() once the IO completed successfully + // - after an IO error => stays `None` forever + // In these exceptional cases, it's `None`. + buf: Option<BytesMut>, +} + +impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W> +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + Self { + writer, + buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + } + } + + pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> { + self.flush().await?; + let Self { buf, writer } = self; + assert!(buf.is_some()); + Ok(writer) + } + + pub async fn write_buffered<B>(&mut self, chunk: Slice<B>) -> std::io::Result<()> + where + B: IoBuf + Send, + { + // avoid memcpy for the middle of the chunk + if chunk.len() >= BUFFER_SIZE { + self.flush().await?; + // do a big write, bypassing `buf` + assert_eq!( + self.buf + .as_ref() + .expect("must not use after an error") + .len(), + 0 + ); + let chunk_len = chunk.len(); + let (nwritten, chunk) = self.writer.write_all(chunk).await?; + assert_eq!(nwritten, chunk_len); + drop(chunk); + return Ok(()); + } + // in-memory copy the < BUFFER_SIZE tail of the chunk + assert!(chunk.len() < BUFFER_SIZE); + let mut chunk = &chunk[..]; + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = BUFFER_SIZE - buf.len(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.len() >= BUFFER_SIZE { + assert_eq!(buf.len(), BUFFER_SIZE); + self.flush().await?; + } + } + assert!(chunk.is_empty(), "by now we should have drained the chunk"); + Ok(()) + } + + async fn flush(&mut self) -> std::io::Result<()> { + let buf = self.buf.take().expect("must not use after an error"); + if buf.is_empty() { + self.buf = Some(buf); + return std::io::Result::Ok(()); + } + let buf_len = buf.len(); + let (nwritten, mut buf) = self.writer.write_all(buf).await?; + assert_eq!(nwritten, buf_len); + buf.clear(); + self.buf = Some(buf); + Ok(()) + } +} + +impl OwnedAsyncWriter for Vec<u8> { + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.extend_from_slice(&buf[..]); + Ok((buf.len(), Slice::into_inner(buf))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Default)] + struct RecorderWriter { + writes: Vec<Vec<u8>>, + } + impl OwnedAsyncWriter for RecorderWriter { + async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>( + &mut self, + buf: B, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + self.writes.push(vec![]); + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.writes.push(Vec::from(&buf[..])); + Ok((buf.len(), Slice::into_inner(buf))) + } + } + + macro_rules! 
write { + ($writer:ident, $data:literal) => {{ + $writer + .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .await?; + }}; + } + + #[tokio::test] + async fn test_buffered_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"a"); + write!(writer, b"b"); + write!(writer, b"c"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"abc"); + write!(writer, b"de"); + write!(writer, b""); + write!(writer, b"fghijk"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::<2, _>::new(recorder); + write!(writer, b"a"); + write!(writer, b"bc"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner().await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] + ); + Ok(()) + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 63a2b30d09..9c7e8748d5 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -109,6 +109,8 @@ impl WalIngest { self.checkpoint_modified = true; } + failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); + match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 9ff0493352..d7987954d4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); -static void HandleSafekeeperResponse(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); @@ -1405,7 +1405,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr newCommitLsn; bool readAnything = false; while (true) @@ -1425,6 +1424,8 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); + readAnything = true; + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1438,35 +1439,28 @@ RecvAppendResponses(Safekeeper *sk) sk->appendResponse.term, wp->propTerm); } - readAnything = true; + HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; - /* update commit_lsn */ - newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); - /* - * Send the new value to all safekeepers. 
- */ - if (newCommitLsn > wp->commitLsn) - { - wp->commitLsn = newCommitLsn; - BroadcastAppendRequest(wp); - } - - HandleSafekeeperResponse(wp); - return sk->state == SS_ACTIVE; } +#define psfeedback_log(fmt, key, ...) \ + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) + /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ static void -ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) +ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; - int32 len; + + /* initialize the struct before parsing */ + memset(ps_feedback, 0, sizeof(PageserverFeedback)); + ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1474,66 +1468,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->currentClusterSize = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + Assert(value_len == sizeof(int64)); + ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); + psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->last_received_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - wp_log(DEBUG2, 
"ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); - - pfree(replyTimeStr); - } + Assert(value_len == sizeof(int64)); + ps_feedback->replytime = pq_getmsgint64(reply_message); + psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); + } + else if (strcmp(key, "shard_number") == 0) + { + Assert(value_len == sizeof(uint32)); + ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); + psfeedback_log("%u", key, ps_feedback->shard_number); } else { - len = pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - /* * Skip unknown keys to support backward compatibile protocol * changes */ - wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); + pq_getmsgbytes(reply_message, value_len); }; } } @@ -1630,12 +1610,30 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) return donor; } +/* + * Process AppendResponse message from safekeeper. + */ static void -HandleSafekeeperResponse(WalProposer *wp) +HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { XLogRecPtr candidateTruncateLsn; + XLogRecPtr newCommitLsn; - wp->api.process_safekeeper_feedback(wp); + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (newCommitLsn > wp->commitLsn) + { + wp->commitLsn = newCommitLsn; + /* Send new value to all safekeepers. */ + BroadcastAppendRequest(wp); + } + + /* + * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). + * The last one will terminate the process if the shutdown is requested + * and WAL is committed by the quorum. BroadcastAppendRequest() should be + * called to notify safekeepers about the new commitLsn. 
+ */ + wp->api.process_safekeeper_feedback(wp, sk); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1811,8 +1809,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParsePageserverFeedbackMessage(wp, &s, &msg->rf); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; pq_getmsgend(&s); return true; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index bc674fd979..28585eb4e7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,6 +10,7 @@ #include "libpqwalproposer.h" #include "neon_walreader.h" +#include "pagestore_client.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback typedef struct PageserverFeedback { + /* true if AppendResponse contains this feedback */ + bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ @@ -276,14 +279,21 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; + uint32 shard_number; } PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + + /* last feedback from each shard */ + PageserverFeedback shard_ps_feedback[MAX_SHARDS]; + int num_shards; + + /* aggregated feedback with min LSNs across shards */ + PageserverFeedback min_ps_feedback; } WalproposerShmemState; /* @@ -307,12 +317,12 @@ typedef struct AppendResponse /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - PageserverFeedback rf; + PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +#define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; @@ -560,11 +570,11 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every new message from the safekeeper. Used to propagate + * Called after every AppendResponse from the safekeeper. Used to propagate * backpressure feedback and to confirm WAL persistence (has been commited * on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp); + void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. 
This is used only diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 8eec2f02c1..c46fd9b3ec 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,6 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -static AppendResponse quorumFeedback; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -71,6 +70,10 @@ static const walproposer_api walprop_pg; static volatile sig_atomic_t got_SIGUSR2 = false; static bool reported_sigusr2 = false; +static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; +static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; +static HotStandbyFeedback agg_hs_feedback; + static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); static void nwp_prepare_shmem(void); @@ -402,21 +405,58 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -static void -replication_feedback_set(PageserverFeedback *rf) +/* + * Record new ps_feedback in the array with shards and update min_feedback. + */ +static PageserverFeedback +record_pageserver_feedback(PageserverFeedback *ps_feedback) { + PageserverFeedback min_feedback; + + Assert(ps_feedback->present); + Assert(ps_feedback->shard_number < MAX_SHARDS); + SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); + + /* Update the number of shards */ + if (ps_feedback->shard_number + 1 > walprop_shared->num_shards) + walprop_shared->num_shards = ps_feedback->shard_number + 1; + + /* Update the feedback */ + memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); + + /* Calculate min LSNs */ + memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback)); + for (int i = 0; i < walprop_shared->num_shards; i++) + { + PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + if (feedback->present) + { + if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) + min_feedback.last_received_lsn = feedback->last_received_lsn; + + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) + min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; + + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) + min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; + } + } + /* Copy min_feedback back to shmem */ + memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback)); + SpinLockRelease(&walprop_shared->mutex); + + return min_feedback; } void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.last_received_lsn; - *flushLsn = walprop_shared->feedback.disk_consistent_lsn; - *applyLsn = walprop_shared->feedback.remote_consistent_lsn; + *writeLsn = walprop_shared->min_ps_feedback.last_received_lsn; + *flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } @@ -1869,39 +1909,6 @@ CheckGracefulShutdown(WalProposer *wp) } } -/* - * Choose most advanced PageserverFeedback and set it to *rf. 
- */ -static void -GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) -{ - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) - { - if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) - { - latest_safekeeper = i; - last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; - } - } - - rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); -} - /* * Combine hot standby feedbacks from all safekeepers. */ @@ -1949,26 +1956,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. */ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp) +walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { - HotStandbyFeedback hsFeedback; - XLogRecPtr oldDiskConsistentLsn; + HotStandbyFeedback hsFeedback; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; - oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - replication_feedback_set(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - - if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + /* handle fresh ps_feedback */ + if (sk->appendResponse.ps_feedback.present) { - if (wp->commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = wp->commitLsn; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + + if (wp->commitLsn > standby_flush_lsn) + { + standby_flush_lsn = wp->commitLsn; + needToAdvanceSlot = true; + } + + if (needToAdvanceSlot) + { /* * Advance the replication slot to commitLsn. WAL before it is * hardened and will be fetched from one of safekeepers by @@ -1977,23 +1996,23 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp) * Also wakes up syncrep waiters. */ ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* write_lsn - This is what durably stored in safekeepers quorum. 
*/ + standby_flush_lsn, + /* flush_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.disk_consistent_lsn, + standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { - quorumFeedback.hs = hsFeedback; + agg_hs_feedback = hsFeedback; ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), EpochFromFullTransactionId(hsFeedback.xmin), diff --git a/poetry.lock b/poetry.lock index 832d7c4334..7b49daf42a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2182,7 +2182,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2529,6 +2528,87 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "12.0" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, + {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, + {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"}, + 
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"}, + {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"}, + {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"}, + {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"}, + {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"}, + {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"}, + {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"}, + {file = 
"websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"}, + {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"}, + {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"}, + {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"}, + {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"}, + {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"}, + {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"}, + {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"}, + {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"}, + {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"}, + {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"}, + {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"}, + {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"}, + {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"}, + {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"}, + {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"}, + {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, +] + [[package]] name = "werkzeug" version = "3.0.1" @@ -2572,16 +2652,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = 
"wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2819,4 +2889,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9" +content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8112c8bf0..b3a5bf873e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -59,7 +59,7 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2.workspace = true +sha2 = { workspace = true, features = ["asm"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 11af85caa4..bc307230dd 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -254,7 +254,7 @@ async fn authenticate_with_secret( config: &'static AuthenticationConfig, ) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret)?; + let auth_outcome = validate_password_and_exchange(&password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { diff --git a/proxy/src/auth/flow.rs 
b/proxy/src/auth/flow.rs index 788381b6c0..f26dcb7c9a 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -126,7 +126,7 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0)?; + let outcome = validate_password_and_exchange(password, self.state.0).await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -180,7 +180,7 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(crate) fn validate_password_and_exchange( +pub(crate) async fn validate_password_and_exchange( password: &[u8], secret: AuthSecret, ) -> super::Result> { @@ -200,7 +200,8 @@ pub(crate) fn validate_password_and_exchange( &scram_secret, sasl_client, crate::config::TlsServerEndPoint::Undefined, - )?; + ) + .await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3b2e0cc204..b36663518d 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -250,6 +250,8 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + info!("cold_start_info=warm"); + ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } @@ -260,6 +262,7 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + info!("cold_start_info=warm"); ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(cached); } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index f476cb9b37..700c8c8681 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -341,7 +341,14 @@ impl Accept for ProxyProtocolAccept { cx: &mut Context<'_>, ) -> Poll>> { let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - tracing::info!(protocol = self.protocol, "accepted new TCP connection"); + + let conn_id = uuid::Uuid::new_v4(); + let span = tracing::info_span!("http_conn", ?conn_id); + { + let _enter = span.enter(); + tracing::info!("accepted new TCP connection"); + } + let Some(conn) = conn else { return Poll::Ready(None); }; @@ -354,6 +361,7 @@ impl Accept for ProxyProtocolAccept { .with_label_values(&[self.protocol]) .guard(), )), + span, }))) } } @@ -364,6 +372,14 @@ pin_project! 
{ pub inner: T, pub connection_id: Uuid, pub gauge: Mutex>, + pub span: tracing::Span, + } + + impl PinnedDrop for WithConnectionGuard { + fn drop(this: Pin<&mut Self>) { + let _enter = this.span.enter(); + tracing::info!("HTTP connection closed") + } } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index a95e734d06..df4b3ec8d7 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -113,7 +113,7 @@ mod tests { ); } - fn run_round_trip_test(server_password: &str, client_password: &str) { + async fn run_round_trip_test(server_password: &str, client_password: &str) { let scram_secret = ServerSecret::build(server_password).unwrap(); let sasl_client = ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); @@ -123,6 +123,7 @@ mod tests { sasl_client, crate::config::TlsServerEndPoint::Undefined, ) + .await .unwrap(); match outcome { @@ -131,14 +132,14 @@ mod tests { } } - #[test] - fn round_trip() { - run_round_trip_test("pencil", "pencil") + #[tokio::test] + async fn round_trip() { + run_round_trip_test("pencil", "pencil").await } - #[test] + #[tokio::test] #[should_panic(expected = "password doesn't match")] - fn failure() { - run_round_trip_test("pencil", "eraser") + async fn failure() { + run_round_trip_test("pencil", "eraser").await } } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 9af7db5201..16575d5d98 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -71,7 +71,7 @@ impl<'a> Exchange<'a> { } } -pub fn exchange( +pub async fn exchange( secret: &ServerSecret, mut client: ScramSha256, tls_server_end_point: config::TlsServerEndPoint, @@ -86,7 +86,14 @@ pub fn exchange( .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; let sent = match init.transition(secret, &tls_server_end_point, client_first)? { Continue(sent, server_first) => { - client.update(server_first.as_bytes())?; + // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread. 
+ // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl + client = tokio::task::spawn_blocking(move || { + client.update(server_first.as_bytes())?; + Ok::(client) + }) + .await + .expect("should not panic while performing password hash")?; sent } Success(x, _) => match x {}, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 68f68eaba1..be9f90acde 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -19,6 +19,7 @@ use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; +use tracing::instrument::Instrumented; use crate::context::RequestMonitoring; use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; @@ -30,13 +31,12 @@ use hyper::{ Body, Method, Request, Response, }; -use std::convert::Infallible; use std::net::IpAddr; use std::sync::Arc; use std::task::Poll; use tls_listener::TlsListener; use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; +use tokio_util::sync::{CancellationToken, DropGuard}; use tracing::{error, info, warn, Instrument}; use utils::http::{error::ApiError, json::json_response}; @@ -100,12 +100,7 @@ pub async fn task_main( let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); ws_connections.close(); // allows `ws_connections.wait` to complete - let tls_listener = TlsListener::new( - tls_acceptor, - addr_incoming, - "http", - config.handshake_timeout, - ); + let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); let make_svc = hyper::service::make_service_fn( |stream: &tokio_rustls::server::TlsStream< @@ -121,6 +116,11 @@ pub async fn task_main( .take() .expect("gauge should be set on connection start"); + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let cancel_connection = http_cancellation_token.clone().drop_guard(); + + let span = conn.span.clone(); let client_addr = conn.inner.client_addr(); let remote_addr = conn.inner.inner.remote_addr(); let backend = backend.clone(); @@ -136,27 +136,43 @@ pub async fn task_main( Ok(MetricService::new( hyper::service::service_fn(move |req: Request| { let backend = backend.clone(); - let ws_connections = ws_connections.clone(); + let ws_connections2 = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); let cancellation_handler = cancellation_handler.clone(); + let http_cancellation_token = http_cancellation_token.child_token(); - async move { - Ok::<_, Infallible>( - request_handler( + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. + ws_connections.spawn( + async move { + // Cancel the current inflight HTTP request if the request stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. 
+ let _cancel_session = http_cancellation_token.clone().drop_guard(); + + let res = request_handler( req, config, backend, - ws_connections, + ws_connections2, cancellation_handler, peer_addr.ip(), endpoint_rate_limiter, + http_cancellation_token, ) .await - .map_or_else(|e| e.into_response(), |r| r), - ) - } + .map_or_else(|e| e.into_response(), |r| r); + + _cancel_session.disarm(); + + res + } + .in_current_span(), + ) }), gauge, + cancel_connection, + span, )) } }, @@ -176,11 +192,23 @@ pub async fn task_main( struct MetricService { inner: S, _gauge: IntCounterPairGuard, + _cancel: DropGuard, + span: tracing::Span, } impl MetricService { - fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService { - MetricService { inner, _gauge } + fn new( + inner: S, + _gauge: IntCounterPairGuard, + _cancel: DropGuard, + span: tracing::Span, + ) -> MetricService { + MetricService { + inner, + _gauge, + _cancel, + span, + } } } @@ -190,14 +218,16 @@ where { type Response = S::Response; type Error = S::Error; - type Future = S::Future; + type Future = Instrumented; fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { self.inner.poll_ready(cx) } fn call(&mut self, req: Request) -> Self::Future { - self.inner.call(req) + self.span + .in_scope(|| self.inner.call(req)) + .instrument(self.span.clone()) } } @@ -210,6 +240,8 @@ async fn request_handler( cancellation_handler: Arc, peer_addr: IpAddr, endpoint_rate_limiter: Arc, + // used to cancel in-flight HTTP requests. not used to cancel websockets + http_cancellation_token: CancellationToken, ) -> Result, ApiError> { let session_id = uuid::Uuid::new_v4(); @@ -253,7 +285,7 @@ async fn request_handler( let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); let span = ctx.span.clone(); - sql_over_http::handle(config, ctx, request, backend) + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9b3ca8d447..72b55c45f0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -50,7 +50,7 @@ impl PoolingBackend { } }; let auth_outcome = - crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; + crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { @@ -84,6 +84,7 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { + info!("cold_start_info=warm"); ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 86c278030f..f675375ff1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -217,8 +217,8 @@ pub async fn handle( mut ctx: RequestMonitoring, request: Request, backend: Arc, + cancel: CancellationToken, ) -> Result, ApiError> { - let cancel = CancellationToken::new(); let cancel2 = cancel.clone(); let handle = tokio::spawn(async move { time::sleep(config.http_config.request_timeout).await; diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs index cce02e3850..33f194dd59 100644 --- a/proxy/src/serverless/tls_listener.rs +++ b/proxy/src/serverless/tls_listener.rs @@ -13,7 +13,7 @@ use tokio::{ time::timeout, }; use 
tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn}; +use tracing::{info, warn, Instrument}; use crate::{ metrics::TLS_HANDSHAKE_FAILURES, @@ -29,24 +29,17 @@ pin_project! { tls: TlsAcceptor, waiting: JoinSet>>, timeout: Duration, - protocol: &'static str, } } impl TlsListener { /// Create a `TlsListener` with default options. - pub(crate) fn new( - tls: TlsAcceptor, - listener: A, - protocol: &'static str, - timeout: Duration, - ) -> Self { + pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { TlsListener { listener, tls, waiting: JoinSet::new(), timeout, - protocol, } } } @@ -73,7 +66,7 @@ where Poll::Ready(Some(Ok(mut conn))) => { let t = *this.timeout; let tls = this.tls.clone(); - let protocol = *this.protocol; + let span = conn.span.clone(); this.waiting.spawn(async move { let peer_addr = match conn.inner.wait_for_addr().await { Ok(Some(addr)) => addr, @@ -86,21 +79,24 @@ where let accept = tls.accept(conn); match timeout(t, accept).await { - Ok(Ok(conn)) => Some(conn), + Ok(Ok(conn)) => { + info!(%peer_addr, "accepted new TLS connection"); + Some(conn) + }, // The handshake failed, try getting another connection from the queue Ok(Err(e)) => { TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}"); + warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); None } // The handshake timed out, try getting another connection from the queue Err(_) => { TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout"); + warn!(%peer_addr, "failed to accept TLS connection: timeout"); None } } - }); + }.instrument(span)); } Poll::Ready(Some(Err(e))) => { tracing::error!("error accepting TCP connection: {e}"); @@ -112,10 +108,7 @@ where loop { return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => { - info!(protocol = this.protocol, "accepted new TLS connection"); - Poll::Ready(Some(Ok(conn))) - } + Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), // The handshake failed to complete, try getting another connection from the queue Poll::Ready(Some(Ok(None))) => continue, // The handshake panicked or was cancelled. 
ignore and get another connection diff --git a/pyproject.toml b/pyproject.toml index 6dff112a5e..e347d47cbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" +websockets = "^12.0" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5c79e9082b..42340ba1df 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -224,6 +224,16 @@ impl SimulationApi { }) .collect::>(); + let empty_feedback = PageserverFeedback { + present: false, + currentClusterSize: 0, + last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + Self { os: args.os, safekeepers: RefCell::new(sk_conns), @@ -232,15 +242,11 @@ impl SimulationApi { last_logged_commit_lsn: 0, shmem: UnsafeCell::new(walproposer::bindings::WalproposerShmemState { mutex: 0, - feedback: PageserverFeedback { - currentClusterSize: 0, - last_received_lsn: 0, - disk_consistent_lsn: 0, - remote_consistent_lsn: 0, - replytime: 0, - }, mineLastElectedTerm: 0, backpressureThrottlingTime: pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, }), config: args.config, event_set: RefCell::new(None), @@ -598,7 +604,11 @@ impl ApiImpl for SimulationApi { } } - fn process_safekeeper_feedback(&mut self, wp: &mut walproposer::bindings::WalProposer) { + fn process_safekeeper_feedback( + &mut self, + wp: &mut walproposer::bindings::WalProposer, + _sk: &mut walproposer::bindings::Safekeeper, + ) { debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); if wp.commitLsn > self.last_logged_commit_lsn { self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store index 4cca3a9857..1f88f252eb 100755 --- a/scripts/ps_ec2_setup_instance_store +++ b/scripts/ps_ec2_setup_instance_store @@ -40,7 +40,7 @@ To run your local neon.git build on the instance store volume, run the following commands from the top of the neon.git checkout # raise file descriptor limit of your shell and its child processes - sudo prlimit -p $$ --nofile=800000:800000 + sudo prlimit -p \$\$ --nofile=800000:800000 # test suite run export TEST_OUTPUT="$TEST_OUTPUT" diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 200c9c3740..4b0c9ac71d 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -2,6 +2,7 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", "fixtures.httpserver", + "fixtures.compute_reconfigure", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py new file mode 100644 index 0000000000..9dd66fe636 --- /dev/null +++ b/test_runner/fixtures/compute_reconfigure.py @@ -0,0 +1,62 @@ +import concurrent.futures +from typing import Any + +import pytest +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log +from fixtures.types import TenantId + + +class ComputeReconfigure: + def __init__(self, server): + self.server = server + self.control_plane_compute_hook_api = 
f"http://{server.host}:{server.port}/notify-attach" + self.workloads = {} + + def register_workload(self, workload): + self.workloads[workload.tenant_id] = workload + + +@pytest.fixture(scope="function") +def compute_reconfigure_listener(make_httpserver): + """ + This fixture exposes an HTTP listener for the storage controller to submit + compute notifications to us, instead of updating neon_local endpoints itself. + + Although storage controller can use neon_local directly, this causes problems when + the test is also concurrently modifying endpoints. Instead, configure storage controller + to send notifications up to this test code, which will route all endpoint updates + through Workload, which has a mutex to make concurrent updates safe. + """ + server = make_httpserver + + self = ComputeReconfigure(server) + + # Do neon_local endpoint reconfiguration in the background so that we can + # accept a healthy rate of calls into notify-attach. + reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1) + + def handler(request: Request): + assert request.json is not None + body: dict[str, Any] = request.json + log.info(f"notify-attach request: {body}") + + try: + workload = self.workloads[TenantId(body["tenant_id"])] + except KeyError: + pass + else: + # This causes the endpoint to query storage controller for its location, which + # is redundant since we already have it here, but this avoids extending the + # neon_local CLI to take full lists of locations + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + + return Response(status=200) + + self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + + yield self + reconfigure_threads.shutdown() + server.clear() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b3f460c7fe..1d30c45278 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, - scan_pageserver_log_for_errors, + DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.types import IndexPartDump @@ -77,6 +77,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, allure_attach_from_dir, + assert_no_errors, get_self_dir, subprocess_capture, wait_until, @@ -944,6 +945,8 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + self.env.storage_controller.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1525,6 +1528,7 @@ class NeonCli(AbstractNeonCli): conf: Optional[Dict[str, Any]] = None, shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, + placement_policy: Optional[str] = None, set_default: bool = False, ) -> Tuple[TenantId, TimelineId]: """ @@ -1558,6 +1562,9 @@ class NeonCli(AbstractNeonCli): if shard_stripe_size is not None: args.extend(["--shard-stripe-size", str(shard_stripe_size)]) + if placement_policy is not None: + args.extend(["--placement-policy", str(placement_policy)]) + res = self.raw_cli(args) res.check_returncode() return tenant_id, timeline_id @@ -1885,19 +1892,6 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=True) - def tenant_migrate( - self, 
tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int] - ): - args = [ - "tenant", - "migrate", - "--tenant-id", - str(tenant_shard_id), - "--id", - str(new_pageserver), - ] - return self.raw_cli(args, check_return_code=True, timeout=timeout_secs) - def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": return self.raw_cli(["start"], check_return_code=check_return_code) @@ -1957,6 +1951,7 @@ class NeonStorageController(MetricsGetter): self.env = env self.running = False self.auth_enabled = auth_enabled + self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS def start(self): assert not self.running @@ -1981,6 +1976,11 @@ class NeonStorageController(MetricsGetter): msg = "" raise StorageControllerApiException(msg, res.status_code) from e + def assert_no_errors(self): + assert_no_errors( + self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + ) + def pageserver_api(self) -> PageserverHttpClient: """ The storage controller implements a subset of the pageserver REST API, for mapping @@ -2088,6 +2088,14 @@ class NeonStorageController(MetricsGetter): ) return response.json() + def tenant_list(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/debug/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + def node_configure(self, node_id, body: dict[str, Any]): log.info(f"node_configure({node_id}, {body})") body["node_id"] = node_id @@ -2135,7 +2143,7 @@ class NeonStorageController(MetricsGetter): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/locate", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", headers=self.headers(TokenScope.ADMIN), ) body = response.json() @@ -2177,6 +2185,23 @@ class NeonStorageController(MetricsGetter): ) log.info("storage controller passed consistency check") + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.request( + "PUT", + f"{self.env.storage_controller_api}/debug/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + def __enter__(self) -> "NeonStorageController": return self @@ -2328,18 +2353,9 @@ class NeonPageserver(PgProtocol): return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return - - with logfile.open("r") as f: - errors = scan_pageserver_log_for_errors(f, self.allowed_errors) - - for _lineno, error in errors: - log.info(f"not allowed error: {error.strip()}") - - assert not errors + assert_no_errors( + self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors + ) def assert_no_metric_errors(self): """ @@ -2944,6 +2960,7 @@ class NeonProxy(PgProtocol): user = quote(kwargs["user"]) password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + timeout = kwargs.get("timeout") log.info(f"Executing http query: {query}") @@ -2957,6 +2974,7 @@ class NeonProxy(PgProtocol): "Neon-Pool-Opt-In": "true", }, verify=str(self.test_output_dir 
/ "proxy.crt"), + timeout=timeout, ) if expected_code is not None: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 8ff4341cc0..ec0f81b380 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -55,7 +55,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # FIXME: These need investigation ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", - ".*Removing intermediate uninit mark file.*", # Tenant::delete_timeline() can cause any of the four following errors. # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed @@ -90,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ) +DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ + # Many tests will take pageservers offline, resulting in log warnings on the controller + # failing to connect to them. + ".*Call to node.*management API.*failed.*receive body.*", + ".*Call to node.*management API.*failed.*ReceiveBody.*", + # Many tests will start up with a node offline + ".*startup_reconcile: Could not scan node.*", +] + + def _check_allowed_errors(input): allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6e082374d7..99ec894106 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -34,7 +34,7 @@ class TimelineCreate406(PageserverApiException): class TimelineCreate409(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 409 - super().__init__("", res.status_code) + super().__init__(res.json()["msg"], res.status_code) @dataclass @@ -357,9 +357,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) - def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + def tenant_secondary_download( + self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + ) -> tuple[int, dict[Any, Any]]: + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" + if wait_ms is not None: + url = url + f"?wait_ms={wait_ms}" + res = self.post(url) self.verbose_error(res) + return (res.status_code, res.json()) def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index b28da83508..c8ab550ad7 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -28,7 +28,7 @@ def platform() -> Optional[str]: @pytest.fixture(scope="function", autouse=True) def pageserver_virtual_file_io_engine() -> Optional[str]: - return None + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") def pytest_generate_tests(metafunc: Metafunc): diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index ea648e460d..80c9b9ce9a 100644 --- 
a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -158,6 +158,9 @@ class TenantShardId: def __str__(self): return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + def __repr__(self): + return self.__str__() + def _tuple(self) -> tuple[TenantId, int, int]: return (self.tenant_id, self.shard_number, self.shard_count) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7fc3bae3af..9365d65fc9 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -11,6 +11,7 @@ from typing import ( Any, Callable, Dict, + Iterable, List, Optional, Tuple, @@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float: ) return round(total_ms, 3) + + +def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + error_or_warn = re.compile(r"\s(ERROR|WARN)") + errors = [] + for lineno, line in enumerate(input, start=1): + if len(line) == 0: + continue + + if error_or_warn.search(line): + # Is this a torn log line? This happens when force-killing a process and restarting + # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192" + if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line): + continue + + # It's an ERROR or WARN. Is it in the allow-list? + for a in allowed_errors: + if re.match(a, line): + break + else: + errors.append((lineno, line)) + return errors + + +def assert_no_errors(log_file, service, allowed_errors): + if not log_file.exists(): + log.warning(f"Skipping {service} log check: {log_file} does not exist") + return + + with log_file.open("r") as f: + errors = scan_log_for_errors(f, allowed_errors) + + for _lineno, error in errors: + log.info(f"not allowed {service} error: {error.strip()}") + + assert not errors, f"Log errors on {service}: {errors[0]}" diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1d5394dc1d..ab8717de54 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,4 +1,5 @@ -from typing import Optional +import threading +from typing import Any, Optional from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -11,6 +12,10 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import TenantId, TimelineId +# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex +# to ensure we don't do that: this enables running lots of Workloads in parallel safely. 
+ENDPOINT_LOCK = threading.Lock() + class Workload: """ @@ -27,6 +32,7 @@ class Workload: tenant_id: TenantId, timeline_id: TimelineId, branch_name: Optional[str] = None, + endpoint_opts: Optional[dict[str, Any]] = None, ): self.env = env self.tenant_id = tenant_id @@ -40,18 +46,33 @@ class Workload: self.churn_cursor = 0 self._endpoint: Optional[Endpoint] = None + self._endpoint_opts = endpoint_opts or {} + + def reconfigure(self): + """ + Request the endpoint to reconfigure based on location reported by storage controller + """ + if self._endpoint is not None: + with ENDPOINT_LOCK: + self._endpoint.reconfigure() def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: - if self._endpoint is None: - self._endpoint = self.env.endpoints.create( - self.branch_name, - tenant_id=self.tenant_id, - pageserver_id=pageserver_id, - endpoint_id="ep-workload", - ) - self._endpoint.start(pageserver_id=pageserver_id) - else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + # We may be running alongside other Workloads for different tenants. Full TTID is + # obnoxiously long for use here, but a cut-down version is still unique enough for tests. + endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" + + with ENDPOINT_LOCK: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + self.branch_name, + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id=endpoint_id, + **self._endpoint_opts, + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -94,7 +115,7 @@ class Workload: else: return False - def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): + def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True): assert self.expect_rows >= n max_iters = 10 @@ -132,22 +153,28 @@ class Workload: ] ) - for tenant_shard_id, pageserver in tenant_get_shards( - self.env, self.tenant_id, pageserver_id - ): - last_flush_lsn = wait_for_last_flush_lsn( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) - ps_http = pageserver.http_client() - wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + if ingest: + # Wait for written data to be ingested by the pageserver + for tenant_shard_id, pageserver in tenant_get_shards( + self.env, self.tenant_id, pageserver_id + ): + last_flush_lsn = wait_for_last_flush_lsn( + self.env, + endpoint, + self.tenant_id, + self.timeline_id, + pageserver_id=pageserver_id, + ) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - if upload: - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") - else: - log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + if upload: + # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) + ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) + wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") 
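For context beyond the diff itself, here is a minimal usage sketch (not part of the patch) showing how the new `ingest` and `upload` flags on `Workload.churn_rows` compose; the `env`, `tenant_id`, and `timeline_id` values are assumed to come from the usual fixture setup:

```python
# Illustrative sketch only: the three wait modes of Workload.churn_rows.
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(256)

workload.churn_rows(64)  # default: wait for pageserver ingest, then checkpoint + S3 upload
workload.churn_rows(64, upload=False)  # wait for ingest, but skip the checkpoint/upload wait
workload.churn_rows(64, upload=False, ingest=False)  # don't block on the pageserver at all,
                                                     # e.g. while the attached pageserver is down
```
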
def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index c98fa44b1a..324ef0d516 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -1,6 +1,5 @@ import asyncio import json -import os from pathlib import Path from typing import Any, Dict, Tuple @@ -20,10 +19,6 @@ from performance.pageserver.util import ( @pytest.mark.parametrize("n_tenants", [10]) @pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) @pytest.mark.timeout(1000) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006", -) def test_basebackup_with_high_slru_count( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index bdc944f352..ddd02238ea 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - env.pageserver.allowed_errors.extend( - [ - ".*invalid branch start lsn: less than latest GC cutoff.*", - ".*invalid branch start lsn: less than planned GC cutoff.*", - ] - ) + error_regexes = [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 46c74a26b8..b79cad979f 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] - ) + error_regexes = [ + ".*invalid branch start lsn.*", + ".*invalid start lsn .* for ancestor timeline.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9a0b91b54e..2a7a3c41ac 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -347,6 +347,64 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB ps_http.timeline_detail(env.initial_tenant, branch_id) +def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + env.pageserver.tenant_create(env.initial_tenant) + + success_timeline = TimelineId.generate() + log.info(f"Creating timeline 
{success_timeline}") + ps_http = env.pageserver.http_client() + success_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + + ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause")) + + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") + with pytest.raises(RequestException): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + t = threading.Thread(target=start_creating_timeline) + try: + t.start() + + wait_until_paused(env, "timeline-creation-after-uninit") + + # While timeline creation is in progress, trying to create a timeline + # again with the same ID should return 409 + with pytest.raises( + PageserverApiException, match="creation of timeline with the given ID is in progress" + ): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + # Creation of a timeline already successfully created is idempotent, and is not impeded by some + # other timeline creation with a different TimelineId being stuck. + repeat_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + assert repeat_result == success_result + finally: + env.pageserver.stop(immediate=True) + t.join() + + # now without a failpoint + env.pageserver.start() + + wait_until_tenant_active(ps_http, env.initial_tenant) + + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + # The one successfully created timeline should still be there. + assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index b046ed7f1b..804ad135ce 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -204,7 +204,7 @@ def test_timeline_init_break_before_checkpoint_recreate( assert timeline_id == new_timeline_id -def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): +def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -214,9 +214,9 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline uninit mark, before any other files were created - pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) - with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + # Introduce failpoint when creating a new timeline, right after creating its directory + pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return")) + with pytest.raises(Exception, match="after-timeline-dir-creation"): _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5f815d3e6c..e0bb4c2062 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -238,6 +238,10 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) + # TODO: remove this workaround after release-5090 is no longer the most recent release. + # There was a bug in that code that generates a warning in the storage controller log. + env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") + # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index ec57860033..132427ba2d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -90,7 +90,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*error importing base backup .*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Removing intermediate uninit mark file.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 877deee08f..81aed704bb 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -37,23 +37,18 @@ def test_pageserver_init_node_id( assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" - assert "missing id" in bad_init.stderr + assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - completed_init = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) + good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 ), "pageserver should be able to create a new config with the node id given" assert pageserver_config.exists(), "config file should be created successfully" - bad_reinit = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) - assert ( - bad_reinit.returncode == 1 - ), "pageserver should not be able to init new config without the node id" + bad_reinit = run_pageserver(good_init_cmd) + assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" assert "already exists, cannot init it" in bad_reinit.stderr bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 3ca13a904d..56b4548b64 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -209,10 +209,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.storage_controller.node_register(env.pageserver) env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( tenant_id=env.initial_tenant, 
conf=TENANT_CONF, timeline_id=env.initial_timeline
     )
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)


 def parse_generation_suffix(key):
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 79145f61b3..e664547b69 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1,4 +1,5 @@
 import json
+import os
 import random
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -553,3 +554,103 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
             )
         ),
     )
+
+
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
+@pytest.mark.parametrize("via_controller", [True, False])
+def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):
+    """
+    Test use of secondary download API for slow downloads, where slow means either a healthy
+    system with a large capacity shard, or some unhealthy remote storage.
+
+    The download API is meant to respect a client-supplied time limit, and return 200 or 202
+    selectively based on whether the download completed.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )
+
+    attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
+    ps_attached = env.get_pageserver(attached_to_id)
+    ps_secondary = next(p for p in env.pageservers if p != ps_attached)
+
+    # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis)
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+    workload.write_rows(128)
+    ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
+
+    # Expect lots of layers
+    assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10
+
+    # Simulate large data by making layer downloads artificially slow
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
+
+    # Upload a heatmap, so that secondaries have something to download
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+
+    if via_controller:
+        http_client = env.storage_controller.pageserver_api()
+        http_client.tenant_location_conf(
+            tenant_id,
+            {
+                "mode": "Secondary",
+                "secondary_conf": {"warm": True},
+                "tenant_conf": {},
+                "generation": None,
+            },
+        )
+    else:
+        http_client = ps_secondary.http_client()
+
+    # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms
+    (status, progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000)
+    assert status == 202
+    assert progress_1["heatmap_mtime"] is not None
+    assert progress_1["layers_downloaded"] > 0
+    assert progress_1["bytes_downloaded"] > 0
+    assert progress_1["layers_total"] > progress_1["layers_downloaded"]
+    assert progress_1["bytes_total"] > progress_1["bytes_downloaded"]
+
+    # Multiple polls should work: use a shorter wait period this time
+    (status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000)
+    assert status == 202
+    assert progress_2["heatmap_mtime"] is not None
+    assert progress_2["layers_downloaded"] > 0
+    assert progress_2["bytes_downloaded"] > 0
+    assert progress_2["layers_total"] > progress_2["layers_downloaded"]
+    assert progress_2["bytes_total"] > progress_2["bytes_downloaded"]
+
+    # Progress should be >= the first poll: this can only go backward if we see a new heatmap,
+    # and the heatmap period on the attached node is much longer than the runtime of this test, so no
+    # new heatmap should have been uploaded.
+    assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"]
+    assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"]
+    assert progress_2["layers_total"] == progress_1["layers_total"]
+    assert progress_2["bytes_total"] == progress_1["bytes_total"]
+
+    # Make downloads fast again: when the download completes within this last request, we
+    # get a 200 instead of a 202
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")])
+    (status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000)
+    assert status == 200
+    assert progress_3["heatmap_mtime"] is not None
+    assert progress_3["layers_total"] == progress_3["layers_downloaded"]
+    assert progress_3["bytes_total"] == progress_3["bytes_downloaded"]
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 078589d8eb..3e986a8f7b 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -596,3 +596,39 @@ def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
     assert (
         "duplicate key value violates unique constraint" in res["message"]
     ), "HTTP query should conflict"
+
+
+def test_sql_over_http_connection_cancel(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http with login password 'http' superuser")
+
+    static_proxy.safe_psql("create table test_table ( id int primary key )")
+
+    # insert into a table, with a unique constraint, after sleeping for n seconds
+    query = "WITH temp AS ( \
+        SELECT pg_sleep($1) as sleep, $2::int as id \
+    ) INSERT INTO test_table (id) SELECT id FROM temp"
+
+    try:
+        # The request should complete before the proxy HTTP timeout triggers.
+        # Timeout and cancel the request on the client side before the query completes.
+ static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds - 1, 1], + user="http", + password="http", + timeout=2, + ) + except requests.exceptions.ReadTimeout: + pass + + # wait until the query _would_ have been complete + time.sleep(static_proxy.http_timeout_seconds) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py new file mode 100644 index 0000000000..6d1cb9765a --- /dev/null +++ b/test_runner/regress/test_proxy_websockets.py @@ -0,0 +1,189 @@ +import ssl + +import pytest +import websockets +from fixtures.neon_fixtures import NeonProxy + + +@pytest.mark.asyncio +async def test_websockets(static_proxy: NeonProxy): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + await websocket.send([length, startup_message]) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_message = password.encode("utf-8") + b"\0" + length = (4 + len(auth_message)).to_bytes(4, byteorder="big") + await websocket.send([b"p", length, auth_message]) + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_message = "SELECT 1".encode("utf-8") + b"\0" + length = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send([b"Q", length, query_message]) + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + 
query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" + assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_pipelined(static_proxy: NeonProxy): + """ + Test whether we can send the startup + auth + query all in one go + """ + + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + auth_message = password.encode("utf-8") + b"\0" + length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big") + query_message = "SELECT 1".encode("utf-8") + b"\0" + length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send( + [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message] + ) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert 
query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" + assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9309af066b..cb58c640c3 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,10 +1,15 @@ import os -from typing import Dict, List, Union +import time +from typing import Dict, List, Optional, Union import pytest +import requests +from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, + StorageControllerApiException, tenant_get_shards, ) from fixtures.remote_storage import s3_storage @@ -259,7 +264,7 @@ def test_sharding_split_smoke( destination = migrate_to_pageserver_ids.pop() log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}") - env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10) + env.storage_controller.tenant_shard_migrate(migrate_shard, destination) workload.validate() @@ -294,7 +299,7 @@ def test_sharding_split_smoke( locations = pageserver.http_client().tenant_list_locations() shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) - log.info("Shards after split: {shards_exist}") + log.info(f"Shards after split: {shards_exist}") assert len(shards_exist) == split_shard_count # Ensure post-split pageserver locations survive a restart (i.e. the child shards @@ -495,3 +500,482 @@ def test_sharding_ingest( # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. 
     assert huge_layer_count <= shard_count
+
+
+class Failure:
+    pageserver_id: Optional[int]
+
+    def apply(self, env: NeonEnv):
+        raise NotImplementedError()
+
+    def clear(self, env: NeonEnv):
+        """
+        Clear the failure, in a way that should enable the system to proceed
+        to a totally clean state (all nodes online and reconciled)
+        """
+        raise NotImplementedError()
+
+    def expect_available(self):
+        raise NotImplementedError()
+
+    def can_mitigate(self):
+        """Whether self.mitigate is available for use"""
+        return False
+
+    def mitigate(self, env: NeonEnv):
+        """
+        Mitigate the failure in a way that should allow shard split to
+        complete and service to resume, but does not guarantee to leave
+        the whole world in a clean state (e.g. an Offline node might have
+        junk LocationConfigs on it)
+        """
+        raise NotImplementedError()
+
+    def fails_forward(self, env: NeonEnv):
+        """
+        If true, this failure results in a state that eventually completes the split.
+        """
+        return False
+
+    def expect_exception(self):
+        """
+        How do we expect a call to the split API to fail?
+        """
+        return StorageControllerApiException
+
+
+class PageserverFailpoint(Failure):
+    def __init__(self, failpoint, pageserver_id, mitigate):
+        self.failpoint = failpoint
+        self.pageserver_id = pageserver_id
+        self._mitigate = mitigate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.allowed_errors.extend(
+            [".*failpoint.*", ".*Resetting.*after shard split failure.*"]
+        )
+        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.http_client().configure_failpoints((self.failpoint, "off"))
+        if self._mitigate:
+            env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"})
+
+    def expect_available(self):
+        return True
+
+    def can_mitigate(self):
+        return self._mitigate
+
+    def mitigate(self, env):
+        env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
+
+
+class StorageControllerFailpoint(Failure):
+    def __init__(self, failpoint, action):
+        self.failpoint = failpoint
+        self.pageserver_id = None
+        self.action = action
+
+    def apply(self, env: NeonEnv):
+        env.storage_controller.configure_failpoints((self.failpoint, self.action))
+
+    def clear(self, env: NeonEnv):
+        if "panic" in self.action:
+            log.info("Restarting storage controller after panic")
+            env.storage_controller.stop()
+            env.storage_controller.start()
+        else:
+            env.storage_controller.configure_failpoints((self.failpoint, "off"))
+
+    def expect_available(self):
+        # Controller panics _do_ leave pageservers available, but our test code relies
+        # on using the locate API to update configurations in Workload, so we must skip
+        # these actions when the controller has been panicked.
+        return "panic" not in self.action
+
+    def can_mitigate(self):
+        return False
+
+    def fails_forward(self, env):
+        # Edge case: the very last failpoint that simulates a DB connection error, where
+        # the abort path will fail-forward and result in a complete split.
+        fail_forward = self.failpoint == "shard-split-post-complete"
+
+        # If the failure was a panic, then if we expect split to eventually (after restart)
+        # complete, we must restart before checking that.
+        if fail_forward and "panic" in self.action:
+            log.info("Restarting storage controller after panic")
+            env.storage_controller.stop()
+            env.storage_controller.start()
+
+        return fail_forward
+
+    def expect_exception(self):
+        if "panic" in self.action:
+            return requests.exceptions.ConnectionError
+        else:
+            return StorageControllerApiException
+
+
+class NodeKill(Failure):
+    def __init__(self, pageserver_id, mitigate):
+        self.pageserver_id = pageserver_id
+        self._mitigate = mitigate
+
+    def apply(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.stop(immediate=True)
+
+    def clear(self, env: NeonEnv):
+        pageserver = env.get_pageserver(self.pageserver_id)
+        pageserver.start()
+
+    def expect_available(self):
+        return False
+
+    def mitigate(self, env):
+        env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
+
+
+class CompositeFailure(Failure):
+    """
+    Wrapper for failures in multiple components (e.g. a failpoint in the storage controller, *and*
+    stop a pageserver to interfere with rollback)
+    """
+
+    def __init__(self, failures: list[Failure]):
+        self.failures = failures
+
+        self.pageserver_id = None
+        for f in failures:
+            if f.pageserver_id is not None:
+                self.pageserver_id = f.pageserver_id
+                break
+
+    def apply(self, env: NeonEnv):
+        for f in self.failures:
+            f.apply(env)
+
+    def clear(self, env):
+        for f in self.failures:
+            f.clear(env)
+
+    def expect_available(self):
+        return all(f.expect_available() for f in self.failures)
+
+    def mitigate(self, env):
+        for f in self.failures:
+            f.mitigate(env)
+
+    def expect_exception(self):
+        expect = set(f.expect_exception() for f in self.failures)
+
+        # We can't give a sensible response if our failures have different expectations
+        assert len(expect) == 1
+
+        return list(expect)[0]
+
+
+@pytest.mark.parametrize(
+    "failure",
+    [
+        PageserverFailpoint("api-500", 1, False),
+        NodeKill(1, False),
+        PageserverFailpoint("api-500", 1, True),
+        NodeKill(1, True),
+        PageserverFailpoint("shard-split-pre-prepare", 1, False),
+        PageserverFailpoint("shard-split-post-prepare", 1, False),
+        PageserverFailpoint("shard-split-pre-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-hardlink", 1, False),
+        PageserverFailpoint("shard-split-post-child-conf", 1, False),
+        PageserverFailpoint("shard-split-lsn-wait", 1, False),
+        PageserverFailpoint("shard-split-pre-finish", 1, False),
+        StorageControllerFailpoint("shard-split-validation", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-begin", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-remote", "return(1)"),
+        StorageControllerFailpoint("shard-split-post-complete", "return(1)"),
+        StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"),
+        StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"),
+        CompositeFailure(
+            [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+        CompositeFailure(
+            [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
+        ),
+    ],
+)
+def test_sharding_split_failures(
+    neon_env_builder: NeonEnvBuilder,
+    compute_reconfigure_listener: ComputeReconfigure,
+    failure: Failure,
+):
+    neon_env_builder.num_pageservers = 4
+    neon_env_builder.control_plane_compute_hook_api = (
+        compute_reconfigure_listener.control_plane_compute_hook_api
+    )
+    initial_shard_count = 2
+    split_shard_count = 4
+
+    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # We exercise failure cases where abort itself will also fail (node offline)
+            ".*abort_tenant_shard_split.*",
+            ".*Failed to abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+            # Node offline cases will fail to send requests
+            ".*Reconcile error: receive body: error sending request for url.*",
+        ]
+    )
+
+    for ps in env.pageservers:
+        # When we do node failures and abandon a shard, it will de-facto have old generation and
+        # thereby be unable to publish remote consistent LSN updates
+        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
+        # If we're using a failure that will panic the storage controller, all background
+        # upcalls from the pageserver can fail
+        ps.allowed_errors.append(".*calling control plane generation validation API failed.*")
+
+    # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything
+    assert (
+        failure.pageserver_id is None
+        or len(
+            env.get_pageserver(failure.pageserver_id)
+            .http_client()
+            .tenant_list_locations()["tenant_shards"]
+        )
+        > 0
+    )
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(100)
+
+    # Put the environment into a failing state (exact meaning depends on `failure`)
+    failure.apply(env)
+
+    with pytest.raises(failure.expect_exception()):
+        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
+
+    # We expect that the overall operation will fail, but some split requests
+    # will have succeeded: the net result should be to return to a clean state, including
+    # detaching any child shards.
+    def assert_rolled_back(exclude_ps_id=None) -> None:
+        count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                assert tenant_shard_id.shard_count == initial_shard_count
+                count += 1
+        assert count == initial_shard_count
+
+    def assert_split_done(exclude_ps_id=None) -> None:
+        count = 0
+        for ps in env.pageservers:
+            if exclude_ps_id is not None and ps.id == exclude_ps_id:
+                continue
+
+            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+            for loc in locations:
+                tenant_shard_id = TenantShardId.parse(loc[0])
+                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
+                assert tenant_shard_id.shard_count == split_shard_count
+                count += 1
+        assert count == split_shard_count
+
+    def finish_split():
+        # Having failed+rolled back, we should be able to split again
+        # No failures this time; it will succeed
+        env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)
+
+        workload.churn_rows(10)
+        workload.validate()
+
+    if failure.expect_available():
+        # Even though the split failed partway through, this should not have interrupted
+        # clients. Disable waiting for pageservers in the workload helper, because our
+        # failpoints may prevent API access.
+        # This only applies for failure modes that leave pageserver page_service API available.
+        workload.churn_rows(10, upload=False, ingest=False)
+        workload.validate()
+
+    if failure.fails_forward(env):
+        log.info("Fail-forward failure, checking split eventually completes...")
+        # A failure type which results in eventual completion of the split
+        wait_until(30, 1, assert_split_done)
+    elif failure.can_mitigate():
+        log.info("Mitigating failure...")
+        # Mitigation phase: we expect to be able to proceed with a successful shard split
+        failure.mitigate(env)
+
+        # The split should appear to be rolled back from the point of view of all pageservers
+        # apart from the one that is offline
+        wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
+
+        finish_split()
+        wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
+
+        # Having cleared the failure, everything should converge to a pristine state
+        failure.clear(env)
+        wait_until(30, 1, assert_split_done)
+    else:
+        # Once we restore the faulty pageserver's API to good health, rollback should
+        # eventually complete.
+        log.info("Clearing failure...")
+        failure.clear(env)
+
+        wait_until(30, 1, assert_rolled_back)
+
+        # Having rolled back, the tenant should be working
+        workload.churn_rows(10)
+        workload.validate()
+
+        # Splitting again should work, since we cleared the failure
+        finish_split()
+        assert_split_done()
+
+    env.storage_controller.consistency_check()
+
+
+def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
+    """
+    Check a scenario where one of the shards is much slower than others.
+    Without backpressure, this would lead to the slow shard falling behind
+    and eventually causing WAL timeouts.
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 256KiB stripes: small enough to get a meaningful data distribution without
+    # writing large quantities of data in this test. The stripe size is given
+    # in number of 8KiB pages.
+ stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + # Slow down one of the shards, around ~1MB/s + pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)")) + + def shards_info(): + infos = [] + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id) + infos.append(shard_info) + last_record_lsn = shard_info["last_record_lsn"] + current_physical_size = shard_info["current_physical_size"] + log.info( + f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}" + ) + return infos + + shards_info() + + workload = Workload( + env, + tenant_id, + timeline_id, + branch_name="main", + endpoint_opts={ + "config_lines": [ + # Tip: set to 100MB to make the test fail + "max_replication_write_lag=1MB", + ], + }, + ) + workload.init() + + endpoint = workload.endpoint() + + # on 2024-03-05, the default config on prod was [15MB, 10GB, null] + res = endpoint.safe_psql_many( + [ + "SHOW max_replication_write_lag", + "SHOW max_replication_flush_lag", + "SHOW max_replication_apply_lag", + ] + ) + log.info(f"backpressure config: {res}") + + last_flush_lsn = None + last_timestamp = None + + def update_write_lsn(): + nonlocal last_flush_lsn + nonlocal last_timestamp + + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag, + received_lsn, + pg_current_wal_flush_lsn() as flush_lsn, + neon.backpressure_throttling_time() as throttling_time + FROM neon.backpressure_lsns(); + """, + dbname="postgres", + )[0] + log.info( + f"received_lsn_lag = {res[0]}, received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}" + ) + + lsn = Lsn(res[2]) + now = time.time() + + if last_timestamp is not None: + delta = now - last_timestamp + delta_bytes = lsn - last_flush_lsn + avg_speed = delta_bytes / delta / 1024 / 1024 + log.info( + f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + ) + + last_flush_lsn = lsn + last_timestamp = now + + update_write_lsn() + + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + update_write_lsn() + shards_info() + + for _write_iter in range(30): + # approximately 1MB of data + workload.write_rows(8000, upload=False) + update_write_lsn() + infos = shards_info() + min_lsn = min(Lsn(info["last_record_lsn"]) for info in infos) + max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) + diff = max_lsn - min_lsn + assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 7a0707b564..a6b0f76c96 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -177,6 +177,7 @@ def test_node_status_after_restart( assert len(nodes) == 2 env.pageservers[1].stop() + env.storage_controller.allowed_errors.extend([".*Could not scan node"]) env.storage_controller.stop() 
env.storage_controller.start() @@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + env.storage_controller.allowed_errors.append(".*Unauthorized.*") + env.storage_controller.allowed_errors.append(".*Forbidden.*") + # No token with pytest.raises( StorageControllerApiException, @@ -769,3 +773,178 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): assert "pitr_interval" not in readback_ps.tenant_specific_overrides env.storage_controller.consistency_check() + + +class Failure: + pageserver_id: int + + def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + raise NotImplementedError() + + +class NodeStop(Failure): + def __init__(self, pageserver_id, immediate): + self.pageserver_id = pageserver_id + self.immediate = immediate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=self.immediate) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.start() + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + + +def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: + tenants = env.storage_controller.tenant_list() + + node_to_tenants: dict[int, list[TenantId]] = {} + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "AttachedSingle" + ): + crnt = node_to_tenants.get(int(node_id), []) + crnt.append(TenantId(t["tenant_shard_id"])) + node_to_tenants[int(node_id)] = crnt + + return node_to_tenants + + +@pytest.mark.parametrize( + "failure", + [ + NodeStop(pageserver_id=1, immediate=False), + NodeStop(pageserver_id=1, immediate=True), + PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + ], +) +def test_sharding_service_heartbeats( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Default log allow list permits connection errors, but this test will use error responses on + # the utilization endpoint. + env.storage_controller.allowed_errors.append( + ".*Call to node.*management API.*failed.*failpoint.*" + ) + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert all([n["availability"] == "Active" for n in nodes]) + + # ... 
then we create two tenants and write some data into them + def create_tenant(tid: TenantId): + env.storage_controller.tenant_create(tid) + + branch_name = "main" + env.neon_cli.create_timeline( + branch_name, + tenant_id=tid, + ) + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + + tenant_ids = [TenantId.generate(), TenantId.generate()] + for tid in tenant_ids: + create_tenant(tid) + + # ... expecting that each tenant will be placed on a different node + def tenants_placed(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + # Check that all the tenants have been attached + assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids) + # Check that each node got one tenant + assert all((len(ts) == 1 for ts in node_to_tenants.values())) + + wait_until(10, 1, tenants_placed) + + # ... then we apply the failure + offline_node_id = failure.pageserver_id + online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop() + env.get_pageserver(offline_node_id).allowed_errors.append( + # In the case of the failpoint failure, the impacted pageserver + # still believes it has the tenant attached since location + # config calls into it will fail due to being marked offline. + ".*Dropped remote consistent LSN updates.*", + ) + + failure.apply(env) + + # ... expecting the heartbeats to mark it offline + def node_offline(): + nodes = env.storage_controller.node_list() + log.info(f"{nodes=}") + target = next(n for n in nodes if n["id"] == offline_node_id) + assert target["availability"] == "Offline" + + # A node is considered offline if the last successful heartbeat + # was more than 10 seconds ago (hardcoded in the storage controller). + wait_until(20, 1, node_offline) + + # .. expecting the tenant on the offline node to be migrated + def tenant_migrated(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + assert set(node_to_tenants[online_node_id]) == set(tenant_ids) + + wait_until(10, 1, tenant_migrated) + + # ... then we clear the failure + failure.clear(env) + + # ... expecting the offline node to become active again + def node_online(): + nodes = env.storage_controller.node_list() + target = next(n for n in nodes if n["id"] == offline_node_id) + assert target["availability"] == "Active" + + wait_until(10, 1, node_online) + + time.sleep(5) + + # ... then we create a new tenant + tid = TenantId.generate() + env.storage_controller.tenant_create(tid) + + # ... expecting it to be placed on the node that just came back online + tenants = env.storage_controller.tenant_list() + newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid)) + locations = list(newest_tenant["observed"]["locations"].keys()) + locations = [int(node_id) for node_id in locations] + assert locations == [offline_node_id] + + # ... 
expecting the storage controller to reach a consistent state + def storage_controller_consistent(): + env.storage_controller.consistency_check() + + wait_until(10, 1, storage_controller_consistent) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 52de889084..a164c7f60a 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. # So by ignoring these instead of waiting for empty upload queue @@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7cea301a9c..025cc930d7 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import List, Tuple @@ -326,7 +327,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.xfail +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -349,10 +350,21 @@ def test_single_branch_get_tenant_size_grows( # adjust the gc_horizon accordingly. 
if pg_version == PgVersion.V14: gc_horizon = 0x4A000 + elif pg_version == PgVersion.V15: + gc_horizon = 0x3BA00 + elif pg_version == PgVersion.V16: + gc_horizon = 210000 + else: + raise NotImplementedError(pg_version) - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + tenant_config = { + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0s", + "gc_horizon": gc_horizon, + } - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] @@ -405,6 +417,7 @@ def test_single_branch_get_tenant_size_grows( current_lsn = after_lsn size_debug_file.write(size_debug) assert size > 0 + log.info(f"size: {size} at lsn {current_lsn}") return (current_lsn, size) with env.endpoints.create_start( @@ -492,24 +505,41 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("DELETE", current_lsn, size)) + size_before_drop = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + )[1] + with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") + # Without setting a PITR interval, dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is too small + # to fall out of gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + prev_size = collected_responses[-1][2] + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + + # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Because synthetic size calculation uses pitr interval when available, + # when our tenant is configured with a tiny pitr interval, dropping a table should + # cause synthetic size to go down immediately + tenant_config["pitr_interval"] = "1ms" + env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + assert size < size_before_drop + # The size of the tenant should still be as large as before we dropped # the table, because the drop operation can still be undone in the PITR # defined by gc_horizon. - (current_lsn, size) = get_current_consistent_size( - env, endpoint, size_debug_file, http_client, tenant_id, timeline_id - ) - - prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append(("DROP", current_lsn, size)) # Should have gone past gc_horizon, otherwise gc_horizon is too large - assert current_lsn - initdb_lsn > gc_horizon + bytes_written = current_lsn - initdb_lsn + assert bytes_written > gc_horizon # this isn't too many lines to forget for a while. 
observed while # developing these tests that locally the value is a bit more than what we diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 1e13a2f20f..f8701b65d7 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) [d for d in tenants_dir.iterdir()] - neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*") + error_regexes = [".*tenant-config-before-write.*"] + neon_simple_env.pageserver.allowed_errors.extend(error_regexes) + neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 96a5cc491a..0eb1327c9e 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( [ f".*{timeline_id}.*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. ".*Ignoring new state, equal to the existing one: Stopping", @@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*failpoint: timeline-delete-before-rm", ".*Ignoring new state, equal to the existing one: Stopping", # this happens, because the stuck timeline is visible to shutdown - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) @@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 205ca18050..628c484fbd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -20,6 +20,7 @@ from fixtures.neon_fixtures import ( VanillaPostgres, wait_for_last_flush_lsn, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils 
import ( assert_tenant_state, timeline_delete_wait_completed, @@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS +def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int): + def condition(): + assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count + + wait_until(5, 1.0, condition) + + def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): """ Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete @@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # That one that we successfully accessed is now Active expect_activated += 1 assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # The ones we didn't touch are still in Attaching assert ( @@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): == n_tenants - expect_activated ) - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # When we unblock logical size calculation, all tenants should proceed to active state via # the warmup route. @@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) - assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + wait_for_tenant_startup_completions(pageserver_http, count=n_tenants) # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b980d6f090..3b09894ddb 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b980d6f090c676e55fb2c830fb2434f532f635c0 +Subproject commit 3b09894ddb8825b50c963942059eab1a2a0b0a89 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 56f32c0e73..80cef885ad 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 56f32c0e7330d17aaeee8bf211a73995180bd133 +Subproject commit 80cef885add1af6741aa31944c7d2c84d8f9098f diff --git a/vendor/revisions.json b/vendor/revisions.json index 1941c235ee..ae524d70b1 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { "postgres-v16": "90078947229aa7f9ac5f7ed4527b2c7386d5332b", - "postgres-v15": "56f32c0e7330d17aaeee8bf211a73995180bd133", - "postgres-v14": "b980d6f090c676e55fb2c830fb2434f532f635c0" + "postgres-v15": "80cef885add1af6741aa31944c7d2c84d8f9098f", + "postgres-v14": "3b09894ddb8825b50c963942059eab1a2a0b0a89" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8593b752c2..152c452dd4 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -64,6 +64,7 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = 
["raw_value"] } +sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["write"] } subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }