Compare commits


28 Commits

Author SHA1 Message Date
Conrad Ludgate
a65a5c372b storage broker 2024-01-18 14:43:29 +00:00
Conrad Ludgate
2cf85471f5 proxy 2024-01-18 14:11:35 +00:00
Conrad Ludgate
665f4ff4b5 move to hyper 1.0 mostly 2024-01-18 12:44:50 +00:00
John Spray
bd19290d9f pageserver: add shard_id to metric labels (#6308)
## Problem

tenant_id/timeline_id is no longer a full identifier for metrics from a
`Tenant` or `Timeline` object.

Closes: https://github.com/neondatabase/neon/issues/5953

## Summary of changes

Include `shard_id` label everywhere we have `tenant_id`/`timeline_id`
label.
2024-01-18 10:52:18 +00:00
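
A minimal sketch of what the label change looks like with the `prometheus` crate; the metric name and registration pattern here are illustrative, not the pageserver's actual metric definitions:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

// Illustrative metric: `shard_id` now sits alongside `tenant_id` and
// `timeline_id`, so two shards of one tenant emit distinct series.
static GETPAGE_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_getpage_requests_total",
        "Number of getpage requests handled",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to register metric")
});

fn on_getpage(tenant_id: &str, shard_id: &str, timeline_id: &str) {
    GETPAGE_REQUESTS
        .with_label_values(&[tenant_id, shard_id, timeline_id])
        .inc();
}
```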
Joonas Koivunen
a584e300d1 test: figure out the relative eviction order assertions (#6375)
I just failed to see this earlier on #6136. Layer counts are used as an
abstraction, and each of the two tenants loses proportionally about the
same number of layers. Sadly, there is no difference between
`relative_spare` and `relative_equal`, as both end up evicting the exact
same number of layers, but I'll try to add another test for those later.

Cc: #5304
2024-01-18 12:39:45 +02:00
Joonas Koivunen
e247ddbddc build: update h2 (#6383)
Notes: https://github.com/hyperium/h2/releases/tag/v0.3.24

Related: https://rustsec.org/advisories/RUSTSEC-2024-0003
2024-01-18 09:54:15 +00:00
Konstantin Knizhnik
0dc4c9b0b8 Relsize hash lru eviction (#6353)
## Problem


Currently, the relation size hash is limited by the `neon.relsize_hash_size`
GUC, with a default value of 64k.
64k relations is not such a small number... but creating 376 databases is
enough to exhaust it.

## Summary of changes

Use an LRU replacement algorithm to prevent hash overflow.


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-01-17 20:34:30 +02:00
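
The change itself is C inside the Postgres extension; below is only a hedged Rust sketch of the LRU policy it describes, with a bounded map that evicts the least recently used relation instead of overflowing. All names are illustrative:

```rust
use std::collections::HashMap;

struct RelsizeCache {
    capacity: usize,
    clock: u64,
    // rel oid -> (last-used tick, cached relation size)
    entries: HashMap<u32, (u64, u64)>,
}

impl RelsizeCache {
    fn new(capacity: usize) -> Self {
        Self { capacity, clock: 0, entries: HashMap::new() }
    }

    fn get(&mut self, rel: u32) -> Option<u64> {
        self.clock += 1;
        let now = self.clock;
        self.entries.get_mut(&rel).map(|e| {
            e.0 = now; // touch: mark as most recently used
            e.1
        })
    }

    fn insert(&mut self, rel: u32, size: u64) {
        self.clock += 1;
        if !self.entries.contains_key(&rel) && self.entries.len() >= self.capacity {
            // Cache is full: evict the least recently used entry rather
            // than refusing the insert (the old overflow behavior).
            let victim = self.entries.iter().min_by_key(|(_, e)| e.0).map(|(k, _)| *k);
            if let Some(victim) = victim {
                self.entries.remove(&victim);
            }
        }
        self.entries.insert(rel, (self.clock, size));
    }
}
```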
John Spray
b6ec11ad78 control_plane: generalize attachment_service to handle sharding (#6251)
## Problem

To test sharding, we need something to control it. We could write Python
code to do this from the test runner, but that wouldn't be usable with
neon_local directly, and when we want to write tests with large numbers
of shards/tenants, Rust is a better fit for efficiently handling all the
required state.

This service enables automated tests to easily get a system with
sharding/HA without the test itself having to set this all up by hand:
existing tests can be run against sharded tenants just by setting a
shard count when creating the tenant.

## Summary of changes

Attachment service was previously a map of TenantId->TenantState, where
the principal state stored for each tenant was the generation and the
last attached pageserver. This enabled it to serve the re-attach and
validate requests that the pageserver requires.

In this PR, the scope of the service is extended substantially to do
overall management of tenants in the pageserver, including
tenant/timeline creation, live migration, evacuation of offline
pageservers etc. This is done using synchronous code to make declarative
changes to the tenant's intended state (`TenantState.policy` and
`TenantState.intent`), which are then translated into calls into the
pageserver by the `Reconciler`.

Top level summary of modules within
`control_plane/attachment_service/src`:
- `tenant_state`: structure that represents one tenant shard.
- `service`: implements the main high-level operations, such as
tenant/timeline creation, marking a node offline, etc.
- `scheduler`: for operations that need to pick a pageserver for a
tenant, construct a scheduler and call into it.
- `compute_hook`: receive notifications when a tenant shard is attached
somewhere new. Once we have locations for all the shards in a tenant,
emit an update to postgres configuration via the neon_local `LocalEnv`.
- `http`: HTTP stubs. These mostly map to methods on `Service`, but are
separated for readability and so that it'll be easier to adapt if/when
we switch to another RPC layer.
- `node`: structure that describes a pageserver node. The most important
attribute of a node is its availability: marking a node offline causes
tenant shards to reschedule away from it.

This PR is a precursor to implementing the full sharding service for
prod (#6342). What's the difference between this and a production-ready
controller for pageservers?
- JSON file persistence to be replaced with a database
- Limited observability.
- No concurrency limits. Marking a pageserver offline will try to
migrate every tenant to a new pageserver concurrently, even if there are
thousands.
- Very simple scheduler that only knows to pick the pageserver with the
fewest tenants, and to place secondary locations on a different pageserver
than attached locations: it does not try to place shards for the same
tenant on different pageservers. This matters little in tests, because
picking the least-used pageserver usually results in round-robin
placement.
- Scheduler state is rebuilt exhaustively for each operation that
requires a scheduler.
- Relies on neon_local mechanisms for updating postgres: in production
this would be something that flows through the real control plane.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2024-01-17 18:01:08 +00:00
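
A hedged sketch of the intent/observed split described in the summary above: callers make declarative edits to intent, and a reconciler translates any divergence into pageserver calls. The types are illustrative stand-ins, not the actual `attachment_service` definitions:

```rust
#[derive(Clone, PartialEq)]
enum Attachment {
    Attached(u64),  // node id of the attached pageserver
    Secondary(u64), // node id holding a secondary location
    Detached,
}

struct TenantShardState {
    intent: Attachment,   // what the service wants
    observed: Attachment, // what the pageserver last reported
}

impl TenantShardState {
    // The reconciler's job in one function: if intent and observed state
    // differ, produce the pageserver call that would converge them.
    fn next_action(&self) -> Option<String> {
        if self.intent == self.observed {
            return None; // already converged, nothing to do
        }
        Some(match &self.intent {
            Attachment::Attached(node) => format!("attach on node {node}"),
            Attachment::Secondary(node) => format!("make secondary on node {node}"),
            Attachment::Detached => "detach".to_string(),
        })
    }
}
```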
John Spray
4cec95ba13 pageserver: add list API for LocationConf (#6329)
## Problem

The `/v1/tenant` listing API only applies to attached tenants.

For an external service to implement a global reconciliation of its list
of shards vs. what's on the pageserver, we need a full view of what's in
TenantManager, including secondary tenant locations and InProgress
locations.

Dependency of https://github.com/neondatabase/neon/pull/6251

## Summary of changes

- Add methods to Tenant and SecondaryTenant to reconstruct the
LocationConf used to create them.
- Add `GET /v1/location_config` API
2024-01-17 13:34:51 +00:00
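
A sketch of how an external reconciler might consume the new endpoint, assuming only that it returns JSON; the response schema is not spelled out here, so the example parses it generically:

```rust
use serde_json::Value;

async fn list_location_configs(pageserver_api: &str) -> anyhow::Result<Value> {
    // Fetch every tenant location the pageserver knows about: attached,
    // secondary, and InProgress alike.
    let url = format!("{pageserver_api}/v1/location_config");
    let resp = reqwest::get(&url).await?.error_for_status()?;
    Ok(resp.json::<Value>().await?)
}
```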
Arpad Müller
ab86060d97 Copy initdb if loading from different timeline ID (#6363)
Previously, if we:

1. created a new timeline B from a different timeline's A initdb
2. deleted timeline A

the initdb for timeline B would be gone, at least in a world where we
are deleting initdbs upon timeline deletion. This world is imminent
(#6226).

Therefore, if the pageserver is instructed to load the initdb from a
different timeline ID, copy it to the newly created timeline's directory
in S3. This ensures that we can disaster recover the new timeline as
well, regardless of whether the original timeline was deleted or not.

Part of https://github.com/neondatabase/neon/issues/5282.
2024-01-17 12:42:42 +01:00
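
The rule reduces to: whenever a timeline is created from another timeline's initdb, give the new timeline its own copy. A hedged sketch, with `copy_initdb_archive` and the id type standing in for the real remote-storage plumbing:

```rust
type TimelineId = u128; // illustrative stand-in

async fn ensure_own_initdb(
    new_timeline: TimelineId,
    initdb_source: Option<TimelineId>,
) -> anyhow::Result<()> {
    if let Some(src) = initdb_source {
        if src != new_timeline {
            // Loading another timeline's initdb: copy it under our own
            // S3 prefix, so deleting the source timeline cannot orphan us.
            copy_initdb_archive(src, new_timeline).await?;
        }
    }
    Ok(())
}

async fn copy_initdb_archive(_from: TimelineId, _to: TimelineId) -> anyhow::Result<()> {
    // remote-storage copy elided in this sketch
    Ok(())
}
```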
Arpad Müller
6ffdcfe6a4 remote_storage: unify azure and S3 tests (#6364)
The remote_storage crate contains two copies of each test, one for azure
and one for S3. The repetition is not necessary and makes the tests more
prone to drift, so we remove it by moving the tests into a shared
module.

The module has a different name depending on where it is included, so
that each test still has "s3" or "azure" in its full path, allowing you
to run just the S3 tests or just the Azure tests.

Earlier PR that removed some duplication already: #6176

Fixes #6146.
2024-01-16 18:45:19 +01:00
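
The inclusion trick is plain Rust: one file of test bodies, mounted twice under different module names so each test's full path carries the backend name. Paths here are illustrative:

```rust
// Running `cargo test s3` matches only remote_storage::tests_s3::*,
// while `cargo test azure` matches only remote_storage::tests_azure::*.
#[cfg(test)]
#[path = "common/shared_tests.rs"]
mod tests_s3;

#[cfg(test)]
#[path = "common/shared_tests.rs"]
mod tests_azure;
```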
Arpad Müller
4b0204ede5 Add copy operation tests and implement them for azure blobs (#6362)
This implements the `copy` operation for Azure blobs, added to S3 by
#6091, and adds tests to both S3 and Azure ensuring that the copy
operation works.
2024-01-16 12:07:20 +00:00
John Spray
bf4e708646 pageserver: eviction for secondary mode tenants (#6225)
Follows #6123 

Closes: https://github.com/neondatabase/neon/issues/5342

The approach here is to avoid using `Layer` from secondary tenants, and
instead make the eviction types (e.g. `EvictionCandidate`) have a
variant that carries a Layer for attached tenants, and a different
variant for secondary tenants.

Other changes:
- EvictionCandidate no longer carries a `Timeline`: this was only used
for providing a witness reference to remote timeline client.
- The types for returning eviction candidates are all in
disk_usage_eviction_task.rs now, whereas some of them were in
timeline.rs before.
- The EvictionCandidate type replaces LocalLayerInfoForDiskUsageEviction
type, which was basically the same thing.
2024-01-16 10:29:26 +00:00
John Spray
887e94d7da page_service: more efficient page_service -> shard lookup (#6037)
## Problem

In #5980 the page service connection handler gets a simple piece of
logic for finding the right Timeline: at connection time, it picks an
arbitrary Timeline, and then when handling individual page requests it
checks if the original timeline is the correct shard, and if not looks
one up.

This is pretty slow in the case where we have to go look up the other
timeline, because we take the big tenants manager lock.

## Summary of changes

- Add a `shard_timelines` map of ShardIndex to Timeline on the page
service connection handler
- When looking up a Timeline for a particular ShardIndex, consult
`shard_timelines` to avoid hitting the TenantsManager unless we really
need to.
- Re-work the CancellationToken handling, because the handler now holds
gateguards on multiple timelines, and so must respect cancellation of
_any_ timeline it has in its cache, not just the timeline related to the
request it is currently servicing.

---------

Co-authored-by: Vlad Lazar <vlad@neon.tech>
2024-01-16 09:39:19 +00:00
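
A hedged sketch of the fast path: a per-connection map consulted before the tenants-manager lock. `ShardIndex`, `Timeline`, and the slow-path lookup are stand-ins for the real pageserver types:

```rust
use std::collections::HashMap;
use std::sync::Arc;

#[derive(Hash, Eq, PartialEq, Clone, Copy)]
struct ShardIndex(u8);
struct Timeline;

fn lookup_via_tenants_manager(_shard: ShardIndex) -> Arc<Timeline> {
    Arc::new(Timeline) // slow path: takes the big tenants-manager lock
}

struct PageServiceHandler {
    shard_timelines: HashMap<ShardIndex, Arc<Timeline>>,
}

impl PageServiceHandler {
    fn timeline_for(&mut self, shard: ShardIndex) -> Arc<Timeline> {
        if let Some(tl) = self.shard_timelines.get(&shard) {
            return tl.clone(); // hot path: no global lock taken
        }
        let tl = lookup_via_tenants_manager(shard);
        self.shard_timelines.insert(shard, tl.clone());
        tl
    }
}
```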
John Spray
df9e9de541 pageserver: API updates for sharding (#6330)
The theme of the changes in this PR is that they're enablers for #6251
which are superficial struct/api changes.

This is a spinoff from #6251:
- Various APIs + clients thereof take TenantShardId rather than TenantId
- The creation API gets a ShardParameters member, which may be used to
configure shard count and stripe size. This enables the attachment
service to present a "virtual pageserver" creation endpoint that creates
multiple shards.
- The attachment service will use tenant size information to drive shard
splitting. Make a version of `TenantHistorySize` that is usable for
decoding these API responses.
- ComputeSpec includes a shard stripe size.
2024-01-16 09:21:00 +00:00
Anna Khanova
3f2187eb92 Proxy relax sni check (#6323)
## Problem

Using the same domain name () for the serverless driver can help with
connection caching.
https://github.com/neondatabase/neon/issues/6290

## Summary of changes

Relax SNI check.
2024-01-16 08:42:13 +00:00
John Khvatov
2a3cfc9665 Remove PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME histogram. (#6356)
Fixes #6343.

## Problem

PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME is used on the hot path and adds
noticeable latency to GetPage@LSN.

## Refs

https://discordapp.com/channels/1176467419317940276/1195022264115151001/1196370689268125716
2024-01-15 17:19:19 +01:00
Cihan Demirci
d34adf46b4 do not provide disclaimer input for the deploy-prod workflow (#6360)
We've removed this input from the deploy-prod workflow.
2024-01-15 16:15:34 +00:00
Conrad Ludgate
0bac8ddd76 proxy: fix serverless error message info (#6279)
## Problem


https://github.com/neondatabase/serverless/issues/51#issuecomment-1878677318

## Summary of changes

1. When we have a db_error, use db_error.message() as the message.
2. Include the error position.
3. `line` should be a string (weird?)
4. `datatype` -> `dataType`
2024-01-15 16:43:19 +01:00
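
Points 3 and 4 are just wire-format shaping; a sketch with serde, where the field set is illustrative rather than the proxy's actual response type:

```rust
use serde::Serialize;

#[derive(Serialize)]
struct SqlError {
    message: String,          // taken from db_error.message()
    position: Option<String>, // error position, now included
    line: Option<String>,     // a string on the wire, odd as that is
    #[serde(rename = "dataType")]
    data_type: Option<String>, // camelCase to match the driver
}
```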
Christian Schwarz
0e1ef3713e fix(pagebench): #6325 broke running without --runtime (#6351)
After PR #6325, when running without --runtime, we wouldn't wait for
start_work_barrier, causing the benchmark to not start at all.
2024-01-15 08:54:19 +00:00
Konstantin Knizhnik
31a4eb40b2 Do not suspend compute if autovacuum is active (#6322)
## Problem

See
https://github.com/orgs/neondatabase/projects/49/views/13?pane=issue&itemId=48282912

## Summary of changes


Do not suspend compute if there are active autovacuum workers.


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-01-14 09:33:57 +02:00
Arpad Müller
60ced06586 Fix timeline creation and tenant deletion race (#6310)
Fixes the race condition between timeline creation and tenant deletion
outlined in #6255.

Related: #5914, which is a similar race condition about the uninit
marker file.

Fixes #6255
2024-01-13 09:15:58 +01:00
Christian Schwarz
b76454ae41 add script to set up EC2 storage-optimized instance store for benchmarking (#6350)
Been using this all the time in
https://github.com/neondatabase/neon/pull/6214

Part of https://github.com/neondatabase/neon/issues/5771

Should consider this in https://github.com/neondatabase/neon/issues/6297
2024-01-12 19:25:17 +00:00
Arthur Petukhovsky
97b48c23f8 Compact some compute_ctl logs (#6346)
Print postgres roles in a single line and add some info.
2024-01-12 18:24:22 +00:00
Christian Schwarz
cd48ea784f TenantInfo: expose generation number (#6348)
Generally useful when debugging / troubleshooting.

I found this useful when manually duplicating a tenant from a script[^1]
where I can't use `neon_fixtures.Pageserver.tenant_attach`'s automatic
integration with neon_local's attachment_service.

[^1]: https://github.com/neondatabase/neon/pull/6349
2024-01-12 18:27:11 +01:00
Alexey Kondratov
1c432d5492 [compute_ctl] Do not miss short-living connections (#6008)
## Problem

Currently, the activity monitor in `compute_ctl` has a 500 ms polling
interval. It also looks at the list of current client backends, looking
for an active one or one with the most recent state change. This means
we can miss short-lived connections.

Yet, while testing this PR, I realized that it's usually not a problem
with pooled connections, as pgbouncer maintains connections to Postgres
even though client connections are short-lived. We can still miss direct
connections.

## Summary of changes

This commit introduces another way to detect user activity on the
compute. It polls the sum of `active_time` and the sum of `sessions`
across all non-system databases in `pg_stat_database` [1]. If the user
runs some queries or just opens a direct connection, it will rise; if
the user drops a database, it can go down, but that's still a change and
will be detected as activity.

The new statistics-based logic seems to be working fine. Yet, after
having it running for a couple of hours, I've seen several odd cases
with connections via pgbouncer:

1. Sometimes, if you run just `psql pooler_connstr -c 'select 1;'`,
   `active_time` may not be updated immediately; it can take a couple
   dozen seconds. This doesn't seem critical, though.
2. With the same query via the pooler, `active_time` can be bumped a
   bit, then pgbouncer keeps the connection to Postgres open for ~10
   minutes, then disconnects, and `active_time` *could be* bumped a bit
   again. 'Could be' because I've seen it once, but it didn't reproduce
   on a second try.

I think this can create false positives (hopefully rare), where we will
not suspend some computes because of a lagged statistics update OR
because some non-user processes try to connect to user databases.
Currently, we don't touch them outside of startup, and
`postgres_exporter` is configured not to discover other databases, but
this can change in the future.

The new behavior is covered by the feature flag
`activity_monitor_experimental`, which should be provided by the control
plane via neondatabase/cloud#9171

[1] https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-DATABASE-VIEW

Related to neondatabase/cloud#7966, neondatabase/cloud#7198
2024-01-12 18:15:41 +01:00
Vlad Lazar
02c6abadf0 pageserver: remove dependency of pagebench on pageserver (#6334)
To achieve this I had to lift the BlockNumber and key_to_rel_block
definitions to pageserver_api (similar to a change in #5980).

Closes #6299
2024-01-12 17:11:19 +00:00
John Spray
7af4c676c0 pageserver: only upload initdb from shard 0 (#6331)
## Problem

When creating a timeline on a sharded tenant, we call into each shard.
We don't need to upload the initdb from every shard: only do it on shard
zero.

## Summary of changes

- Move the initdb upload into a function, and only call it on shard
zero.
2024-01-12 15:32:27 +01:00
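
A sketch of the gating: timeline creation runs on every shard, but the initdb upload function is invoked only when the shard number is zero. Names are illustrative:

```rust
async fn create_timeline_on_shard(shard_number: u8) -> anyhow::Result<()> {
    // ... per-shard timeline creation work (elided) ...
    if shard_number == 0 {
        // Only shard zero owns the initdb archive; the other shards
        // would just upload identical copies.
        upload_initdb_tarball().await?;
    }
    Ok(())
}

async fn upload_initdb_tarball() -> anyhow::Result<()> {
    // upload to remote storage (elided)
    Ok(())
}
```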
109 changed files with 7370 additions and 3105 deletions

View File

@@ -1131,7 +1131,7 @@ jobs:
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1

Cargo.lock (generated, 564 lines changed): diff suppressed because it is too large.

View File

@@ -3,6 +3,7 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/attachment_service",
"pageserver",
"pageserver/ctl",
"pageserver/client",
@@ -50,7 +51,7 @@ async-trait = "0.1"
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.0"
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.0"
aws-smithy-types = { version = "1.1.2", features = ["http-body-1-x"] }
aws-credential-types = "1.0"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
@@ -88,7 +89,12 @@ hostname = "0.3.1"
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper = "1.0.0"
hyper-util = "0.1.0"
http = "1"
http-body = "1"
http-body-util = "0.1"
hyper-tungstenite = "0.13.0"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -111,7 +117,7 @@ parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
prost = "0.12"
rand = "0.8"
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
@@ -119,7 +125,7 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
reqwest-retry = "0.2.2"
routerify = "3"
routerify = { git = "https://github.com/conradludgate/routerify", branch = "hyper1" }
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.21"
@@ -147,7 +153,7 @@ tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
thiserror = "1.0"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tls-listener = { version = "0.9", features = ["rustls", "tokio-net"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.10.0"
@@ -155,15 +161,13 @@ tokio-rustls = "0.24"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
tokio-tungstenite = "0.20"
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tonic = {version = "0.10", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
tungstenite = "0.20"
url = "2.2"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
@@ -211,7 +215,7 @@ criterion = "0.5.1"
rcgen = "0.11"
rstest = "0.18"
camino-tempfile = "1.0.2"
tonic-build = "0.9"
tonic-build = "0.10.2"
[patch.crates-io]

View File

@@ -12,7 +12,11 @@ cfg-if.workspace = true
clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
hyper = { workspace = true, features = ["server"] }
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true

View File

@@ -231,9 +231,9 @@ fn main() -> Result<()> {
let compute = Arc::new(compute_node);
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps postgres start quicker later,
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have it's memory allocated from the host, and
// the necessary binaries will alreaady be cached.
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
@@ -276,6 +276,11 @@ fn main() -> Result<()> {
state.status = ComputeStatus::Init;
compute.state_changed.notify_all();
info!(
"running compute with features: {:?}",
state.pspec.as_ref().unwrap().spec.features
);
drop(state);
// Launch remaining service threads
@@ -288,7 +293,7 @@ fn main() -> Result<()> {
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
error!("could not start the compute node: {:#}", err);
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;

View File

@@ -20,7 +20,7 @@ use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{error, info, instrument, warn};
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -280,7 +280,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
$$;"#,
roles_decl, database_decl,
);
info!("Neon superuser created:\n{}", inlinify(&query));
info!("Neon superuser created: {}", inlinify(&query));
client
.simple_query(&query)
.map_err(|e| anyhow::anyhow!(e).context(query))?;
@@ -964,6 +964,16 @@ impl ComputeNode {
Ok(pg_process)
}
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
let mut state = self.state.lock().unwrap();
// NB: `Some(<DateTime>)` is always greater than `None`.
if last_active > state.last_active {
state.last_active = last_active;
debug!("set the last compute activity time to: {:?}", last_active);
}
}
// Look for core dumps and collect backtraces.
//
// EKS worker nodes have following core dump settings:

View File

@@ -6,14 +6,22 @@ use std::sync::Arc;
use std::thread;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use bytes::Bytes;
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use http_body_util::BodyExt;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::service::service_fn;
use hyper::{Method, Request, Response, StatusCode};
use hyper_util::rt::TokioExecutor;
use hyper_util::rt::TokioIo;
use hyper_util::server::conn;
use num_cpus;
use serde_json;
use tokio::net::TcpListener;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
@@ -36,7 +44,7 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
}
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
async fn routes(req: Request<Incoming>, compute: &Arc<ComputeNode>) -> Response<Full<Bytes>> {
//
// NOTE: The URI path is currently included in traces. That's OK because
// it doesn't contain any variable parts or sensitive information. But
@@ -48,7 +56,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
info!("serving /status GET request");
let state = compute.state.lock().unwrap();
let status_response = status_response_from_state(&state);
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
Response::new(Full::from(serde_json::to_string(&status_response).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
@@ -56,7 +64,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
let metrics = compute.state.lock().unwrap().metrics.clone();
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
Response::new(Full::from(serde_json::to_string(&metrics).unwrap()))
}
// Collect Postgres current usage insights
@@ -66,11 +74,11 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
return Response::new(Full::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
Response::new(Full::from(insights))
}
(&Method::POST, "/check_writability") => {
@@ -82,15 +90,15 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
status
);
error!(msg);
return Response::new(Body::from(msg));
return Response::new(Full::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Ok(_) => Response::new(Full::from("true")),
Err(e) => {
error!("check_writability failed: {}", e);
Response::new(Body::from(e.to_string()))
Response::new(Full::from(e.to_string()))
}
}
}
@@ -98,7 +106,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::GET, "/info") => {
let num_cpus = num_cpus::get_physical();
info!("serving /info GET request. num_cpus: {}", num_cpus);
Response::new(Body::from(
Response::new(Full::from(
serde_json::json!({
"num_cpus": num_cpus,
})
@@ -115,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Ok(msg) => Response::new(Full::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
@@ -132,7 +140,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// if no remote storage is configured
if compute.ext_remote_storage.is_none() {
info!("no extensions remote storage configured");
let mut resp = Response::new(Body::from("no remote storage configured"));
let mut resp = Response::new(Full::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
@@ -143,7 +151,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
let mut resp = Response::new(Full::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
@@ -165,7 +173,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
Some(r) => r,
None => {
info!("no remote extensions spec was provided");
let mut resp = Response::new(Body::from("no remote storage configured"));
let mut resp = Response::new(Full::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
@@ -182,10 +190,10 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
match ext {
Ok((ext_name, ext_path)) => {
match compute.download_extension(ext_name, ext_path).await {
Ok(_) => Response::new(Body::from("OK")),
Ok(_) => Response::new(Full::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
let mut resp = Response::new(Full::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
@@ -193,7 +201,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
Err(e) => {
warn!("extension download failed to find extension: {}", e);
let mut resp = Response::new(Body::from("failed to find file"));
let mut resp = Response::new(Full::from("failed to find file"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
@@ -202,7 +210,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
let mut not_found = Response::new(Full::from("404 Not Found"));
*not_found.status_mut() = StatusCode::NOT_FOUND;
not_found
}
@@ -210,7 +218,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
async fn handle_configure_request(
req: Request<Body>,
req: Request<Incoming>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
@@ -220,7 +228,7 @@ async fn handle_configure_request(
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let body_bytes = req.into_body().collect().await.unwrap().to_bytes();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
@@ -287,13 +295,13 @@ async fn handle_configure_request(
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
fn render_json_error(e: &str, status: StatusCode) -> Response<Full<Bytes>> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.body(Body::from(serde_json::to_string(&error).unwrap()))
.body(Full::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
@@ -304,35 +312,43 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let make_service = make_service_fn(move |_conn| {
let service = service_fn(move |req: Request<Incoming>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
}))
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
});
info!("starting HTTP server on {}", addr);
let server = Server::bind(&addr).serve(make_service);
// Run this server forever
if let Err(e) = server.await {
error!("server error: {}", e);
let listener = TcpListener::bind(addr).await.unwrap();
loop {
let (stream, _) = match listener.accept().await {
Ok(r) => r,
Err(e) => {
error!("server error: {}", e);
return;
}
};
let io = TokioIo::new(stream);
let service = service.clone();
tokio::task::spawn(async move {
let builder = conn::auto::Builder::new(TokioExecutor::new());
let res = builder.serve_connection(io, service).await;
if let Err(err) = res {
println!("Error serving connection: {:?}", err);
}
});
}
}

View File

@@ -3,88 +3,118 @@ use std::{thread, time::Duration};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use tracing::{debug, info, warn};
use tracing::{debug, error, info, warn};
use crate::compute::ComputeNode;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeFeature;
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
// XXX: the only expected panic is at `RwLock` unwrap().
// NB: the only expected panic is at `Mutex` unwrap(), all other errors
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.as_str();
// During startup and configuration we connect to every Postgres database,
// but we don't want to count this as some user activity. So wait until
// the compute fully started before monitoring activity.
wait_for_postgres_start(compute);
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = Client::connect(connstr, NoTls);
info!("watching Postgres activity at {}", connstr);
let mut sleep = false;
let mut prev_active_time: Option<f64> = None;
let mut prev_sessions: Option<i64> = None;
if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
info!("starting experimental activity monitor for {}", connstr);
} else {
info!("starting activity monitor for {}", connstr);
}
loop {
// Should be outside of the write lock to allow others to read while we sleep.
thread::sleep(MONITOR_CHECK_INTERVAL);
// We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
// But skip the first sleep, so we can connect to Postgres immediately.
if sleep {
// Should be outside of the mutex lock to allow others to read while we sleep.
thread::sleep(MONITOR_CHECK_INTERVAL);
} else {
sleep = true;
}
match &mut client {
Ok(cli) => {
if cli.is_closed() {
info!("connection to postgres closed, trying to reconnect");
info!("connection to Postgres is closed, trying to reconnect");
// Connection is closed, reconnect and try again.
client = Client::connect(connstr, NoTls);
continue;
}
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli
.query(
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
FROM pg_stat_activity
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);
let mut last_active = compute.state.lock().unwrap().last_active;
// This is a new logic, only enable if the feature flag is set.
// TODO: remove this once we are sure that it works OR drop it altogether.
if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
// First, check if the total active time or sessions across all databases has changed.
// If it did, it means that user executed some queries. In theory, it can even go down if
// some databases were dropped, but it's still a user activity.
match get_database_stats(cli) {
Ok((active_time, sessions)) => {
let mut detected_activity = false;
if let Ok(backs) = backends {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
for b in backs.into_iter() {
let state: String = match b.try_get("state") {
Ok(state) => state,
Err(_) => continue,
};
if state == "idle" {
let change: String = match b.try_get("state_change") {
Ok(state_change) => state_change,
Err(_) => continue,
};
let change = DateTime::parse_from_rfc3339(&change);
match change {
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
Err(e) => {
info!("cannot parse backend state_change DateTime: {}", e);
continue;
prev_active_time = match prev_active_time {
Some(prev_active_time) => {
if active_time != prev_active_time {
detected_activity = true;
}
Some(active_time)
}
}
} else {
// Found non-idle backend, so the last activity is NOW.
// Save it and exit the for loop. Also clear the idle backend
// `state_change` timestamps array as it doesn't matter now.
last_active = Some(Utc::now());
idle_backs.clear();
break;
}
}
None => Some(active_time),
};
prev_sessions = match prev_sessions {
Some(prev_sessions) => {
if sessions != prev_sessions {
detected_activity = true;
}
Some(sessions)
}
None => Some(sessions),
};
// Get idle backend `state_change` with the max timestamp.
if let Some(last) = idle_backs.iter().max() {
last_active = Some(*last);
if detected_activity {
// Update the last active time and continue, we don't need to
// check backends state change.
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
error!("could not get database statistics: {}", e);
continue;
}
}
}
// If there are existing (logical) walsenders, do not suspend.
// Second, if database statistics is the same, check all backends state change,
// maybe there is some with more recent activity. `get_backends_state_change()`
// can return None or stale timestamp, so it's `compute.update_last_active()`
// responsibility to check if the new timestamp is more recent than the current one.
// This helps us to discover new sessions, that did nothing yet.
match get_backends_state_change(cli) {
Ok(last_active) => {
compute.update_last_active(last_active);
}
Err(e) => {
error!("could not get backends state change: {}", e);
}
}
// Finally, if there are existing (logical) walsenders, do not suspend.
//
// walproposer doesn't currently show up in pg_stat_replication,
// but protect if it will be
@@ -93,11 +123,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_ws) => {
if num_ws > 0 {
last_active = Some(Utc::now());
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
warn!("failed to parse ws count: {:?}", e);
warn!("failed to parse walsenders count: {:?}", e);
continue;
}
},
@@ -106,17 +137,31 @@ fn watch_compute_activity(compute: &ComputeNode) {
continue;
}
}
// Update the last activity in the shared state if we got a more recent one.
let mut state = compute.state.lock().unwrap();
// NB: `Some(<DateTime>)` is always greater than `None`.
if last_active > state.last_active {
state.last_active = last_active;
debug!("set the last compute activity time to: {:?}", last_active);
//
// Do not suspend compute if autovacuum is running
//
let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
match cli.query_one(autovacuum_count_query, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_workers) => {
if num_workers > 0 {
compute.update_last_active(Some(Utc::now()));
continue;
}
}
Err(e) => {
warn!("failed to parse autovacuum workers count: {:?}", e);
continue;
}
},
Err(e) => {
warn!("failed to get list of autovacuum workers: {:?}", e);
continue;
}
}
}
Err(e) => {
debug!("cannot connect to postgres: {}, retrying", e);
debug!("could not connect to Postgres: {}, retrying", e);
// Establish a new connection and try again.
client = Client::connect(connstr, NoTls);
@@ -125,12 +170,124 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
}
// Hang on condition variable waiting until the compute status is `Running`.
fn wait_for_postgres_start(compute: &ComputeNode) {
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::Running {
info!("compute is not running, waiting before monitoring activity");
state = compute.state_changed.wait(state).unwrap();
if state.status == ComputeStatus::Running {
break;
}
}
}
// Figure out the total active time and sessions across all non-system databases.
// Returned tuple is `(active_time, sessions)`.
// It can return `0.0` active time or `0` sessions, which means no user databases exist OR
// it was a start with skipped `pg_catalog` updates and user didn't do any queries
// (or open any sessions) yet.
fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
// Filter out `postgres` database as `compute_ctl` and other monitoring tools
// like `postgres_exporter` use it to query Postgres statistics.
// Use explicit 8 bytes type casts to match Rust types.
let stats = cli.query_one(
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
coalesce(sum(sessions), 0)::bigint AS total_sessions
FROM pg_stat_database
WHERE datname NOT IN (
'postgres',
'template0',
'template1'
);",
&[],
);
let stats = match stats {
Ok(stats) => stats,
Err(e) => {
return Err(anyhow::anyhow!("could not query active_time: {}", e));
}
};
let active_time: f64 = match stats.try_get("total_active_time") {
Ok(active_time) => active_time,
Err(e) => return Err(anyhow::anyhow!("could not get total_active_time: {}", e)),
};
let sessions: i64 = match stats.try_get("total_sessions") {
Ok(sessions) => sessions,
Err(e) => return Err(anyhow::anyhow!("could not get total_sessions: {}", e)),
};
Ok((active_time, sessions))
}
// Figure out the most recent state change time across all client backends.
// If there is currently active backend, timestamp will be `Utc::now()`.
// It can return `None`, which means no client backends exist or we were
// unable to parse the timestamp.
fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime<Utc>>> {
let mut last_active: Option<DateTime<Utc>> = None;
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli.query(
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
FROM pg_stat_activity
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);
match backends {
Ok(backs) => {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
for b in backs.into_iter() {
let state: String = match b.try_get("state") {
Ok(state) => state,
Err(_) => continue,
};
if state == "idle" {
let change: String = match b.try_get("state_change") {
Ok(state_change) => state_change,
Err(_) => continue,
};
let change = DateTime::parse_from_rfc3339(&change);
match change {
Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
Err(e) => {
info!("cannot parse backend state_change DateTime: {}", e);
continue;
}
}
} else {
// Found non-idle backend, so the last activity is NOW.
// Return immediately, no need to check other backends.
return Ok(Some(Utc::now()));
}
}
// Get idle backend `state_change` with the max timestamp.
if let Some(last) = idle_backs.iter().max() {
last_active = Some(*last);
}
}
Err(e) => {
return Err(anyhow::anyhow!("could not query backends: {}", e));
}
}
Ok(last_active)
}
/// Launch a separate compute monitor thread and return its `JoinHandle`.
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
let state = Arc::clone(state);
pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
let compute = Arc::clone(compute);
thread::Builder::new()
.name("compute-monitor".into())
.spawn(move || watch_compute_activity(&state))
.spawn(move || watch_compute_activity(&compute))
.expect("cannot launch compute monitor thread")
}

View File

@@ -456,7 +456,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
/// - no new lines were written for the last second
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
let mut lines = tokio::io::BufReader::new(stderr).lines();
let timeout_duration = Duration::from_secs(1);
let timeout_duration = Duration::from_millis(100);
let ts_regex =
regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");

View File

@@ -190,18 +190,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// Print a list of existing Postgres roles (only in debug mode)
if span_enabled!(Level::INFO) {
info!("postgres roles:");
let mut vec = Vec::new();
for r in &existing_roles {
info!(
" - {}:{}",
vec.push(format!(
"{}:{}",
r.name,
if r.encrypted_password.is_some() {
"[FILTERED]"
} else {
"(null)"
}
);
));
}
info!("postgres roles (total {}): {:?}", vec.len(), vec);
}
// Process delta operations first
@@ -239,7 +241,10 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// Refresh Postgres roles info to handle possible roles renaming
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
info!("cluster spec roles:");
info!(
"handling cluster spec roles (total {})",
spec.cluster.roles.len()
);
for role in &spec.cluster.roles {
let name = &role.name;
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
@@ -302,7 +307,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);
info!("running role create query: '{}'", &query);
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
@@ -319,7 +324,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => " -> create",
RoleAction::Update => " -> update",
};
info!(" - {}:{}{}", name, pwd, action_str);
info!(" - {}:{}{}", name, pwd, action_str);
}
}
@@ -428,10 +433,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// Print a list of existing Postgres databases (only in debug mode)
if span_enabled!(Level::INFO) {
info!("postgres databases:");
let mut vec = Vec::new();
for (dbname, db) in &existing_dbs {
info!(" {}:{}", dbname, db.owner);
vec.push(format!("{}:{}", dbname, db.owner));
}
info!("postgres databases (total {}): {:?}", vec.len(), vec);
}
// Process delta operations first
@@ -503,7 +509,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// Refresh Postgres databases info to handle possible renames
let existing_dbs = get_existing_dbs(client)?;
info!("cluster spec databases:");
info!(
"handling cluster spec databases (total {})",
spec.cluster.databases.len()
);
for db in &spec.cluster.databases {
let name = &db.name;
let pg_db = existing_dbs.get(name);
@@ -562,7 +571,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
DatabaseAction::Create => " -> create",
DatabaseAction::Update => " -> update",
};
info!(" - {}:{}{}", db.name, db.owner, action_str);
info!(" - {}:{}{}", db.name, db.owner, action_str);
}
}

View File

@@ -0,0 +1,32 @@
[package]
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,116 @@
use std::collections::HashMap;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
use postgres_connection::parse_host_port;
use utils::id::{NodeId, TenantId};
pub(super) struct ComputeHookTenant {
shards: Vec<(ShardIndex, NodeId)>,
}
impl ComputeHookTenant {
pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
// Find the highest shard count and drop any shards that aren't
// for that shard count.
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
let Some(shard_count) = shard_count else {
// No shards, nothing to do.
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
return Ok(());
};
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
// We have pageservers for all the shards: proceed to reconfigure compute
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
tracing::warn!(
"Couldn't load neon_local config, skipping compute update ({e})"
);
return Ok(());
}
};
let cplane = ComputeControlPlane::load(env.clone())
.expect("Error loading compute control plane");
let compute_pageservers = self
.shards
.iter()
.map(|(_shard, node_id)| {
let ps_conf = env
.get_pageserver_conf(*node_id)
.expect("Unknown pageserver");
let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
(pg_host, pg_port.unwrap_or(5432))
})
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
} else {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.0
);
}
Ok(())
}
}
/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
}
impl ComputeHook {
pub(super) fn new() -> Self {
Self {
state: Default::default(),
}
}
pub(super) async fn notify(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<()> {
tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
let mut locked = self.state.lock().await;
let entry = locked
.entry(tenant_shard_id.tenant_id)
.or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
let shard_index = ShardIndex {
shard_count: tenant_shard_id.shard_count,
shard_number: tenant_shard_id.shard_number,
};
let mut set = false;
for (existing_shard, existing_node) in &mut entry.shards {
if *existing_shard == shard_index {
*existing_node = node_id;
set = true;
}
}
if !set {
entry.shards.push((shard_index, node_id));
}
entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
}
}

View File

@@ -0,0 +1,218 @@
use crate::reconciler::ReconcileError;
use crate::service::Service;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::TenantId;
use utils::{
http::{
endpoint::{self},
error::ApiError,
json::{json_request, json_response},
RequestExt, RouterBuilder,
},
id::NodeId,
};
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
TenantShardMigrateRequest,
};
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
let allowlist_routes = ["/status"]
.iter()
.map(|v| v.parse().unwrap())
.collect::<Vec<_>>();
Self {
service,
auth,
allowlist_routes,
}
}
}
#[inline(always)]
fn get_state(request: &Request<Body>) -> &HttpState {
request
.data::<Arc<HttpState>>()
.expect("unknown state type")
.as_ref()
}
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.re_attach(reattach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.validate(validate_req))
}
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.attach_hook(attach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.inspect(inspect_req))
}
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.tenant_create(create_req).await?,
)
}
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_timeline_create(tenant_id, create_req)
.await?,
)
}
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req);
state.service.node_register(register_req).await?;
json_response(StatusCode::OK, ())
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
if node_id != config_req.node_id {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Path and body node_id differ"
)));
}
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_shard_migrate(tenant_shard_id, migrate_req)
.await?,
)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
}
impl From<ReconcileError> for ApiError {
fn from(value: ReconcileError) -> Self {
ApiError::Conflict(format!("Reconciliation error: {}", value))
}
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router();
if auth.is_some() {
router = router.middleware(auth_middleware(|request| {
let state = get_state(request);
if state.allowlist_routes.contains(request.uri()) {
None
} else {
state.auth.as_deref()
}
}))
}
router
.data(Arc::new(HttpState::new(service, auth)))
.get("/status", |r| request_span(r, handle_status))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
.post("/node", |r| request_span(r, handle_node_register))
.put("/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
.post("/tenant", |r| request_span(r, handle_tenant_create))
.post("/tenant/:tenant_id/timeline", |r| {
request_span(r, handle_tenant_timeline_create)
})
.get("/tenant/:tenant_id/locate", |r| {
request_span(r, handle_tenant_locate)
})
.put("/tenant/:tenant_shard_id/migrate", |r| {
request_span(r, handle_tenant_shard_migrate)
})
}
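
As a rough illustration of the API surface wired up above: a client can hit the status route and register a node with plain JSON requests. This is a minimal sketch, assuming the service listens on 127.0.0.1:1234 with auth disabled and that reqwest/serde_json are available; the address and field values are illustrative, and the node body mirrors NodeRegisterRequest.

use serde_json::json;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Hypothetical listen address; with auth enabled you would also send an
    // `Authorization: Bearer <jwt>` header on each request.
    let base = "http://127.0.0.1:1234";
    let client = reqwest::Client::new();

    // Liveness check against the status endpoint.
    let resp = client.get(format!("{base}/status")).send().await?;
    assert!(resp.status().is_success());

    // Register a pageserver; the body mirrors NodeRegisterRequest.
    let resp = client
        .post(format!("{base}/node"))
        .json(&json!({
            "node_id": 1,
            "listen_pg_addr": "127.0.0.1",
            "listen_pg_port": 64000,
            "listen_http_addr": "127.0.0.1",
            "listen_http_port": 9898,
        }))
        .send()
        .await?;
    assert!(resp.status().is_success());
    Ok(())
}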



@@ -0,0 +1,57 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;
mod compute_hook;
pub mod http;
mod node;
pub mod persistence;
mod reconciler;
mod scheduler;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
}
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);
impl Sequence {
fn initial() -> Self {
Self(0)
}
}
impl std::fmt::Display for Sequence {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl MonotonicCounter<Sequence> for Sequence {
fn cnt_advance(&mut self, v: Sequence) {
assert!(*self <= v);
*self = v;
}
fn cnt_value(&self) -> Sequence {
*self
}
}
impl Sequence {
fn next(&self) -> Sequence {
Sequence(self.0 + 1)
}
}
impl Default for PlacementPolicy {
fn default() -> Self {
PlacementPolicy::Double(1)
}
}
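
A quick sketch of how these two types behave, written as an in-crate test (assuming serde's default externally-tagged enum encoding; serde_json is already used by this crate's persistence module):

#[cfg(test)]
mod placement_and_sequence_sketch {
    use super::*;
    use utils::seqwait::MonotonicCounter;

    #[test]
    fn placement_policy_encoding() {
        // Unit variants encode as strings, tuple variants as single-key maps.
        assert_eq!(
            serde_json::to_string(&PlacementPolicy::Single).unwrap(),
            "\"Single\""
        );
        assert_eq!(
            serde_json::to_string(&PlacementPolicy::Double(1)).unwrap(),
            "{\"Double\":1}"
        );
    }

    #[test]
    fn sequence_advances_monotonically() {
        let mut s = Sequence::initial();
        let next = s.next();
        s.cnt_advance(next); // would panic if `next` were behind `s`
        assert!(s.cnt_value() == Sequence(1));
    }
}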


@@ -0,0 +1,100 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use attachment_service::http::make_router;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use camino::Utf8PathBuf;
use clap::Parser;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
/// Host and port to listen on, like `127.0.0.1:1234`
#[arg(short, long)]
listen: std::net::SocketAddr,
/// Path to public key for JWT authentication of clients
#[arg(long)]
public_key: Option<camino::Utf8PathBuf>,
/// Token for authenticating this service with the pageservers it controls
#[arg(short, long)]
jwt_token: Option<String>,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Utf8PathBuf,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
logging::init(
LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path,
args.listen
);
let config = Config {
jwt_token: args.jwt_token,
};
let persistence = Arc::new(Persistence::new(&args.path).await);
let service = Service::spawn(config, persistence).await?;
let http_listener = tcp_listener::bind(args.listen)?;
let auth = if let Some(public_key_path) = &args.public_key {
let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
} else {
None
};
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
})?;
Ok(())
}
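
For local experiments the binary needs only a listen address and a state file path, e.g. `attachment_service --listen 127.0.0.1:1234 --path .neon/attachments.json`, with `--jwt-token` and `--public-key` added when pageserver auth is enabled (the address and paths here are illustrative).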


@@ -0,0 +1,37 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,
pub(crate) availability: NodeAvailability,
pub(crate) scheduling: NodeSchedulingPolicy,
pub(crate) listen_http_addr: String,
pub(crate) listen_http_port: u16,
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: u16,
}
impl Node {
pub(crate) fn base_url(&self) -> String {
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
}
/// Is this node eligible to have work scheduled onto it?
pub(crate) fn may_schedule(&self) -> bool {
match self.availability {
NodeAvailability::Active => {}
NodeAvailability::Offline => return false,
}
match self.scheduling {
NodeSchedulingPolicy::Active => true,
NodeSchedulingPolicy::Draining => false,
NodeSchedulingPolicy::Filling => true,
NodeSchedulingPolicy::Pause => false,
}
}
}
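
A small in-crate test sketch of the scheduling gate above (field values are illustrative):

#[cfg(test)]
mod may_schedule_sketch {
    use super::*;
    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
    use utils::id::NodeId;

    fn node(availability: NodeAvailability, scheduling: NodeSchedulingPolicy) -> Node {
        Node {
            id: NodeId(1),
            availability,
            scheduling,
            listen_http_addr: "127.0.0.1".to_string(),
            listen_http_port: 9898,
            listen_pg_addr: "127.0.0.1".to_string(),
            listen_pg_port: 64000,
        }
    }

    #[test]
    fn availability_and_policy_both_gate_scheduling() {
        // Offline wins regardless of the scheduling policy...
        assert!(!node(NodeAvailability::Offline, NodeSchedulingPolicy::Active).may_schedule());
        // ...while an active node schedules when Active or Filling,
        // but not when Draining or Pause.
        assert!(node(NodeAvailability::Active, NodeSchedulingPolicy::Filling).may_schedule());
        assert!(!node(NodeAvailability::Active, NodeSchedulingPolicy::Pause).may_schedule());
    }
}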


@@ -0,0 +1,272 @@
use std::{collections::HashMap, str::FromStr};
use camino::{Utf8Path, Utf8PathBuf};
use control_plane::{
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
local_env::LocalEnv,
};
use pageserver_api::{
models::TenantConfig,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::{
generation::Generation,
id::{NodeId, TenantId},
};
use crate::{node::Node, PlacementPolicy};
/// Placeholder for storage. This will be replaced with a database client.
pub struct Persistence {
state: std::sync::Mutex<PersistentState>,
}
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
tenants: HashMap<TenantShardId, TenantShardPersistence>,
#[serde(skip)]
path: Utf8PathBuf,
}
/// A convenience for serializing the state inside a sync lock, and then
/// writing it to disk outside of the lock. This will go away when switching
/// to a database backend.
struct PendingWrite {
bytes: Vec<u8>,
path: Utf8PathBuf,
}
impl PendingWrite {
async fn commit(&self) -> anyhow::Result<()> {
tokio::fs::write(&self.path, &self.bytes).await?;
Ok(())
}
}
impl PersistentState {
fn save(&self) -> PendingWrite {
PendingWrite {
bytes: serde_json::to_vec(self).expect("Serialization error"),
path: self.path.clone(),
}
}
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
decoded.path = path.to_owned();
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: for an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = format!("{}", tenant_id);
tenant.config = serde_json::to_string(&TenantConfig::default())?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
}
}
Ok(decoded)
}
async fn load_or_new(path: &Utf8Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path);
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path);
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
}
}
}
}
impl Persistence {
pub async fn new(path: &Utf8Path) -> Self {
let state = PersistentState::load_or_new(path).await;
Self {
state: std::sync::Mutex::new(state),
}
}
/// When registering a node, persist it so that on next start we will be able to
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
// TODO: node persistence will come with the database backend
Ok(())
}
/// At startup, we populate the service's list of nodes, and use this list to call into
/// each node to do an initial reconciliation of the state of the world with our in-memory
/// observed state.
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
let env = LocalEnv::load_config()?;
// TODO: node persistence will come with the database backend
// XXX hack: enable test_backward_compatibility to work by populating our list of
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
// first startup in the compat test, we may have shards but no nodes.
let mut result = Vec::new();
tracing::info!(
"Loaded {} pageserver nodes from LocalEnv",
env.pageservers.len()
);
for ps_conf in env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
result.push(Node {
id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
});
}
Ok(result)
}
/// At startup, we populate our map of tenant shards from persistent storage.
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
let locked = self.state.lock().unwrap();
Ok(locked.tenants.values().cloned().collect())
}
/// Tenants must be persisted before we schedule them for the first time. This enables us
/// to preserve generation monotonicity, and to retain the externally provided placement policy & config, across restarts.
pub(crate) async fn insert_tenant_shards(
&self,
shards: Vec<TenantShardPersistence>,
) -> anyhow::Result<()> {
let write = {
let mut locked = self.state.lock().unwrap();
for shard in shards {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
shard_number: ShardNumber(shard.shard_number as u8),
shard_count: ShardCount(shard.shard_count as u8),
};
locked.tenants.insert(tenant_shard_id, shard);
}
locked.save()
};
write.commit().await?;
Ok(())
}
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
/// advancing generation number. We also store the NodeId for which the generation was issued, so that in
/// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
pub(crate) async fn increment_generation(
&self,
tenant_shard_id: TenantShardId,
node_id: Option<NodeId>,
) -> anyhow::Result<Generation> {
let (write, gen) = {
let mut locked = self.state.lock().unwrap();
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
// If we're called with a None pageserver, we need only update the generation
// record to disassociate it from this pageserver, not actually increment the number, as
// the increment is guaranteed to happen the next time this tenant is attached.
if node_id.is_some() {
shard.generation += 1;
}
shard.generation_pageserver = node_id;
let gen = Generation::new(shard.generation);
(locked.save(), gen)
};
write.commit().await?;
Ok(gen)
}
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
let (write, result) = {
let mut result = HashMap::new();
let mut locked = self.state.lock().unwrap();
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
if shard.generation_pageserver == Some(node_id) {
shard.generation += 1;
result.insert(*tenant_shard_id, Generation::new(shard.generation));
}
}
(locked.save(), result)
};
write.commit().await?;
Ok(result)
}
// TODO: when we start shard splitting, we must durably mark the tenant so that
// on restart, we know that we must go through recovery (list shards that exist
// and pick up where we left off and/or revert to parent shards).
#[allow(dead_code)]
pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
todo!();
}
// TODO: when we finish shard splitting, we must atomically clean up the old shards
// and insert the new shards, and clear the splitting marker.
#[allow(dead_code)]
pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
todo!();
}
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
#[serde(default)]
pub(crate) shard_number: i32,
#[serde(default)]
pub(crate) shard_count: i32,
#[serde(default)]
pub(crate) shard_stripe_size: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: u32,
#[serde(default)]
pub(crate) placement_policy: String,
#[serde(default)]
pub(crate) config: String,
}
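
The generation rules above can be exercised end to end: incrementing with a node id bumps the number, while passing None only detaches. A minimal in-crate sketch, assuming tokio's test macro is available; the scratch path and tenant id are illustrative.

#[cfg(test)]
mod generation_sketch {
    use super::*;
    use std::str::FromStr;
    use utils::generation::Generation;
    use utils::id::{NodeId, TenantId};

    #[tokio::test]
    async fn increment_only_when_node_given() -> anyhow::Result<()> {
        let path = Utf8PathBuf::from("/tmp/attachments_sketch.json");
        let persistence = Persistence::new(&path).await;

        let tenant_id = TenantId::from_str("0123456789abcdef0123456789abcdef")?;
        persistence
            .insert_tenant_shards(vec![TenantShardPersistence {
                tenant_id: tenant_id.to_string(),
                shard_number: 0,
                shard_count: 0,
                shard_stripe_size: 0,
                generation_pageserver: None,
                generation: 0,
                placement_policy: String::new(),
                config: String::new(),
            }])
            .await?;

        let shard_id = TenantShardId::unsharded(tenant_id);
        // Attaching to a node increments the generation...
        let gen = persistence
            .increment_generation(shard_id, Some(NodeId(1)))
            .await?;
        assert!(gen == Generation::new(1));
        // ...while passing None only detaches, leaving the number unchanged.
        let gen = persistence.increment_generation(shard_id, None).await?;
        assert!(gen == Generation::new(1));
        Ok(())
    }
}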


@@ -0,0 +1,495 @@
use crate::persistence::Persistence;
use crate::service;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_client::mgmt_api;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
use crate::compute_hook::ComputeHook;
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Generation,
pub(crate) intent: IntentState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
pub(crate) service_config: service::Config,
/// A snapshot of the pageservers as they were when we were asked
/// to reconcile.
pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
/// A hook to notify the running postgres instances when we change the location
/// of a tenant
pub(crate) compute_hook: Arc<ComputeHook>,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed.
pub(crate) cancel: CancellationToken,
/// Access to persistent storage for updating generation numbers
pub(crate) persistence: Arc<Persistence>,
}
#[derive(thiserror::Error, Debug)]
pub enum ReconcileError {
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Reconciler {
async fn location_config(
&mut self,
node_id: NodeId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
let node = self
.pageservers
.get(&node_id)
.expect("Pageserver may not be removed while referenced");
self.observed
.locations
.insert(node.id, ObservedStateLocation { conf: None });
tracing::info!("location_config({}) calling: {:?}", node_id, config);
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
client
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
.await?;
tracing::info!("location_config({}) complete: {:?}", node_id, config);
self.observed
.locations
.insert(node.id, ObservedStateLocation { conf: Some(config) });
Ok(())
}
async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
let destination = if let Some(node_id) = self.intent.attached {
match self.observed.locations.get(&node_id) {
Some(conf) => {
// We will do a live migration only if the intended destination is not
// currently in an attached state.
match &conf.conf {
Some(conf) if conf.mode == LocationConfigMode::Secondary => {
// Fall through to do a live migration
node_id
}
None | Some(_) => {
// Attached or uncertain: don't do a live migration, proceed
// with a general-case reconciliation
tracing::info!("maybe_live_migrate: destination is None or attached");
return Ok(());
}
}
}
None => {
// Our destination is not attached: maybe live migrate if some other
// node is currently attached. Fall through.
node_id
}
}
} else {
// No intent to be attached
tracing::info!("maybe_live_migrate: no attached intent");
return Ok(());
};
let mut origin = None;
for (node_id, state) in &self.observed.locations {
if let Some(observed_conf) = &state.conf {
if observed_conf.mode == LocationConfigMode::AttachedSingle {
let node = self
.pageservers
.get(node_id)
.expect("Nodes may not be removed while referenced");
// We will only attempt live migration if the origin is not offline: this
// avoids trying to do it while reconciling after responding to an HA failover.
if !matches!(node.availability, NodeAvailability::Offline) {
origin = Some(*node_id);
break;
}
}
}
}
let Some(origin) = origin else {
tracing::info!("maybe_live_migrate: no origin found");
return Ok(());
};
// We have an origin and a destination: proceed to do the live migration
tracing::info!("Live migrating {}->{}", origin, destination);
self.live_migrate(origin, destination).await?;
Ok(())
}
async fn get_lsns(
&self,
tenant_shard_id: TenantShardId,
node_id: &NodeId,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let node = self
.pageservers
.get(node_id)
.expect("Pageserver may not be removed while referenced");
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
let timelines = client.timeline_list(&tenant_shard_id).await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
.collect())
}
async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
let node = self
.pageservers
.get(node_id)
.expect("Pageserver may not be removed while referenced");
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
match client.tenant_secondary_download(tenant_shard_id).await {
Ok(()) => {}
Err(_) => {
tracing::info!(" (skipping, destination wasn't in secondary mode)")
}
}
}
async fn await_lsn(
&self,
tenant_shard_id: TenantShardId,
pageserver_id: &NodeId,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
Ok(l) => l,
Err(e) => {
println!(
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
pageserver_id
);
tokio::time::sleep(Duration::from_millis(500)).await;
continue;
}
};
let mut any_behind: bool = false;
for (timeline_id, baseline_lsn) in &baseline {
match latest.get(timeline_id) {
Some(latest_lsn) => {
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
if latest_lsn < baseline_lsn {
any_behind = true;
}
}
None => {
// Expected timeline isn't yet visible on migration destination.
// (IRL we would have to account for timeline deletion, but this
// is just a test helper)
any_behind = true;
}
}
}
if !any_behind {
println!("✅ LSN caught up. Proceeding...");
break;
} else {
tokio::time::sleep(Duration::from_millis(500)).await;
}
}
Ok(())
}
pub async fn live_migrate(
&mut self,
origin_ps_id: NodeId,
dest_ps_id: NodeId,
) -> anyhow::Result<()> {
// `maybe_live_migrate` is responsible for the sanity of inputs
assert!(origin_ps_id != dest_ps_id);
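// Outline of the steps below:
//   1. demote the origin to AttachedStale, with a flush deadline;
//   2. record the origin's per-timeline LSNs as a catch-up baseline;
//   3. if the destination already holds a secondary location, warm it up;
//   4. increment the generation and attach the destination as AttachedMulti;
//   5. wait for the destination to reach the baseline LSNs;
//   6. notify compute, downgrade the origin to Secondary, and finally switch
//      the destination to AttachedSingle.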
fn build_location_config(
shard: &ShardIdentity,
config: &TenantConfig,
mode: LocationConfigMode,
generation: Option<Generation>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation: generation.map(|g| g.into().unwrap()),
secondary_conf,
tenant_conf: config.clone(),
shard_number: shard.number.0,
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
}
}
tracing::info!(
"🔁 Switching origin pageserver {} to stale mode",
origin_ps_id
);
// FIXME: it is incorrect to use self.generation here, we should use the generation
// from the ObservedState of the origin pageserver (it might be older than self.generation)
let stale_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
Some(self.generation),
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
// If we are migrating to a destination that has a secondary location, warm it up first
if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
if let Some(destination_conf) = &destination_conf.conf {
if destination_conf.mode == LocationConfigMode::Secondary {
tracing::info!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps_id,
);
self.secondary_download(self.tenant_shard_id, &dest_ps_id)
.await;
}
}
}
// Increment generation before attaching to new pageserver
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, Some(dest_ps_id))
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
Some(self.generation),
None,
);
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
self.location_config(dest_ps_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
tracing::info!("🕑 Waiting for LSN to catch up...");
self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
.await?;
}
tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
self.compute_hook
.notify(self.tenant_shard_id, dest_ps_id)
.await?;
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
// this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
.await?;
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
// partway through. In fact, all location conf API calls should be in a wrapper that sets
// the observed state to None, then runs, then sets it to what we wrote.
self.observed.locations.insert(
origin_ps_id,
ObservedStateLocation {
conf: Some(origin_secondary_conf),
},
);
println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps_id
);
let dest_final_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
.await?;
self.observed.locations.insert(
dest_ps_id,
ObservedStateLocation {
conf: Some(dest_final_conf),
},
);
println!("✅ Migration complete");
Ok(())
}
/// Reconciling a tenant makes API calls to pageservers until the observed state
/// matches the intended state.
///
/// First we apply special case handling (e.g. for live migrations), and then a
/// general case reconciliation where we walk through the intent by pageserver
/// and call out to the pageserver to apply the desired state.
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
// TODO: if any of self.observed is None, call to remote pageservers
// to learn correct state.
// Special case: live migration
self.maybe_live_migrate().await?;
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!("Observed configuration already correct.")
}
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, Some(node_id))
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!("Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
if let Err(e) = self
.compute_hook
.notify(self.tenant_shard_id, node_id)
.await
{
tracing::warn!(
"Failed to notify compute of newly attached pageserver {node_id}: {e}"
);
}
}
}
}
// Configure secondary locations: if these were previously attached, this
// implicitly downgrades them from attached to secondary.
let mut changes = Vec::new();
for node_id in &self.intent.secondary {
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
match self.observed.locations.get(node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location.
tracing::info!(%node_id, "Observed configuration requires update.");
changes.push((*node_id, wanted_conf))
}
}
}
// Detach any extraneous pageservers that are no longer referenced
// by our intent.
let all_pageservers = self.intent.all_pageservers();
for node_id in self.observed.locations.keys() {
if all_pageservers.contains(node_id) {
// We are only detaching pageservers that aren't used at all.
continue;
}
changes.push((
*node_id,
LocationConfig {
mode: LocationConfigMode::Detached,
generation: None,
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
},
));
}
for (node_id, conf) in changes {
self.location_config(node_id, conf, None).await?;
}
Ok(())
}
}
pub(crate) fn attached_location_conf(
generation: Generation,
shard: &ShardIdentity,
config: &TenantConfig,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: generation.into(),
secondary_conf: None,
shard_number: shard.number.0,
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
}
pub(crate) fn secondary_location_conf(
shard: &ShardIdentity,
config: &TenantConfig,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: shard.number.0,
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
}


@@ -0,0 +1,89 @@
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};
use crate::{node::Node, tenant_state::TenantState};
/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
#[error("No pageservers found")]
NoPageservers,
#[error("No pageserver found matching constraint")]
ImpossibleConstraint,
}
impl From<ScheduleError> for ApiError {
fn from(value: ScheduleError) -> Self {
ApiError::Conflict(format!("Scheduling error: {}", value))
}
}
pub(crate) struct Scheduler {
tenant_counts: HashMap<NodeId, usize>,
}
impl Scheduler {
pub(crate) fn new(
tenants: &BTreeMap<TenantShardId, TenantState>,
nodes: &HashMap<NodeId, Node>,
) -> Self {
let mut tenant_counts = HashMap::new();
for node_id in nodes.keys() {
tenant_counts.insert(*node_id, 0);
}
for tenant in tenants.values() {
if let Some(ps) = tenant.intent.attached {
let entry = tenant_counts.entry(ps).or_insert(0);
*entry += 1;
}
}
for (node_id, node) in nodes {
if !node.may_schedule() {
tenant_counts.remove(node_id);
}
}
Self { tenant_counts }
}
pub(crate) fn schedule_shard(
&mut self,
hard_exclude: &[NodeId],
) -> Result<NodeId, ScheduleError> {
if self.tenant_counts.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut tenant_counts: Vec<(NodeId, usize)> = self
.tenant_counts
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) {
None
} else {
Some((*k, *v))
}
})
.collect();
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
tenant_counts.sort_by_key(|i| (i.1, i.0));
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left
return Err(ScheduleError::ImpossibleConstraint);
}
for (node_id, count) in &tenant_counts {
tracing::info!("tenant_counts[{node_id}]={count}");
}
let node_id = tenant_counts.first().unwrap().0;
tracing::info!("scheduler selected node {node_id}");
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
Ok(node_id)
}
}
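
The selection rule above is a lexicographic sort on (shard count, node id), so ties on load break toward the lowest node id. A standalone sketch of the same rule, deliberately not using the crate's types:

fn pick_least_loaded(counts: &[(u64, usize)]) -> Option<u64> {
    // Mirror schedule_shard: sort candidates by (count, node_id) and take
    // the first entry.
    let mut candidates: Vec<(u64, usize)> = counts.to_vec();
    candidates.sort_by_key(|&(node_id, count)| (count, node_id));
    candidates.first().map(|&(node_id, _)| node_id)
}

fn main() {
    // Nodes 2 and 3 both hold one shard; node 1 holds two.
    let counts = vec![(1, 2), (2, 1), (3, 1)];
    // The tie between 2 and 3 breaks toward the lower node id.
    assert_eq!(pick_least_loaded(&counts), Some(2));
}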

File diff suppressed because it is too large


@@ -0,0 +1,455 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::Persistence,
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
scheduler::{ScheduleError, Scheduler},
service, PlacementPolicy, Sequence,
};
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
// Runtime only: sequence used to coordinate updates to this object while
// background reconcilers may be running. A reconciler runs to a particular
// sequence.
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
pub(crate) policy: PlacementPolicy,
// Low level description of exactly which pageservers should fulfil
// which role. Generated by `Self::schedule`.
pub(crate) intent: IntentState,
// Low level description of how the tenant is configured on pageservers:
// if this does not match `Self::intent` then the tenant needs reconciliation
// with `Self::reconcile`.
pub(crate) observed: ObservedState,
// Tenant configuration, passed through opaquely to the pageserver. Identical
// for all shards in a tenant.
pub(crate) config: TenantConfig,
/// If a reconcile task is currently in flight, it may be joined here (it is
/// only safe to join if either the result has been received or the reconciler's
/// cancellation token has been fired)
pub(crate) reconciler: Option<ReconcilerHandle>,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// Indicates sequence number for which we have encountered an error reconciling. If
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
/// and callers should stop waiting for `waiter` and propagate the error.
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// The most recent error from a reconcile on this tenant
/// TODO: generalize to an array of recent events
/// TODO: use an ArcSwap instead of a mutex for faster reads?
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
}
#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
#[derive(Default, Clone)]
pub(crate) struct ObservedState {
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}
/// Our latest knowledge of how this tenant is configured in the outside world.
///
/// Meaning:
/// * No instance of this type exists for a node: we are certain that we have nothing configured on that
/// node for this shard.
/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.
pub(crate) conf: Option<LocationConfig>,
}
pub(crate) struct ReconcilerWaiter {
// For observability purposes, remember the ID of the shard we're
// waiting for.
pub(crate) tenant_shard_id: TenantShardId,
seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
error: std::sync::Arc<std::sync::Mutex<String>>,
seq: Sequence,
}
#[derive(thiserror::Error, Debug)]
pub enum ReconcileWaitError {
#[error("Timeout waiting for shard {0}")]
Timeout(TenantShardId),
#[error("shutting down")]
Shutdown,
#[error("Reconcile error on shard {0}: {1}")]
Failed(TenantShardId, String),
}
impl ReconcilerWaiter {
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
tokio::select! {
result = self.seq_wait.wait_for_timeout(self.seq, timeout)=> {
result.map_err(|e| match e {
SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
})?;
},
result = self.error_seq_wait.wait_for(self.seq) => {
result.map_err(|e| match e {
SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
SeqWaitError::Timeout => unreachable!()
})?;
return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
}
}
Ok(())
}
}
/// Having spawned a reconciler task, the tenant shard's state will carry enough
/// information to optionally cancel & await it later.
pub(crate) struct ReconcilerHandle {
sequence: Sequence,
handle: JoinHandle<()>,
cancel: CancellationToken,
}
/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incomplete description
/// of state (i.e. any nodes present in the result should override nodes
/// present in the parent tenant state, but any unmentioned nodes should
/// not be removed from parent tenant state)
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl ObservedState {
pub(crate) fn new() -> Self {
Self {
locations: HashMap::new(),
}
}
}
impl TenantState {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
policy: PlacementPolicy,
) -> Self {
Self {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
reconciler: None,
sequence: Sequence(1),
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
}
}
/// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
/// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
/// to get an intent state that complies with placement policy. The overall goal is to do scheduling
/// in a way that makes use of any configured locations that already exist in the outside world.
pub(crate) fn intent_from_observed(&mut self) {
// Choose an attached location by filtering observed locations, and then sorting to get the highest
// generation
let mut attached_locs = self
.observed
.locations
.iter()
.filter_map(|(node_id, l)| {
if let Some(conf) = &l.conf {
if conf.mode == LocationConfigMode::AttachedMulti
|| conf.mode == LocationConfigMode::AttachedSingle
|| conf.mode == LocationConfigMode::AttachedStale
{
Some((node_id, conf.generation))
} else {
None
}
} else {
None
}
})
.collect::<Vec<_>>();
attached_locs.sort_by_key(|i| i.1);
if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
self.intent.attached = Some(*node_id);
}
// All remaining observed locations generate secondary intents. This includes None
// observations, as these may well have some local content on disk that is usable (this
// is an edge case that might occur if we restarted during a migration or other change)
self.observed.locations.keys().for_each(|node_id| {
if Some(*node_id) != self.intent.attached {
self.intent.secondary.push(*node_id);
}
});
}
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
// pageservers if so.
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut used_pageservers = self.intent.all_pageservers();
let mut modified = false;
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.secondary.clear();
modified = true;
}
}
Double(secondary_count) => {
// Should have exactly one attached, and N secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.secondary.push(node_id);
used_pageservers.push(node_id);
modified = true;
}
}
}
if modified {
self.sequence.0 += 1;
}
Ok(())
}
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
return true;
}
}
}
for node_id in &self.intent.secondary {
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
match self.observed.locations.get(node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
return true;
}
}
}
false
}
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
pageservers: &Arc<HashMap<NodeId, Node>>,
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
let mut dirty_observed = false;
for (node_id, observed_loc) in &self.observed.locations {
let node = pageservers
.get(node_id)
.expect("Nodes may not be removed while referenced");
if observed_loc.conf.is_none()
&& !matches!(node.availability, NodeAvailability::Offline)
{
dirty_observed = true;
break;
}
}
if !self.dirty() && !dirty_observed {
tracing::info!("Not dirty, no reconciliation needed.");
return None;
}
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
});
}
}
// Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
// doing our sequence's work.
let old_handle = self.reconciler.take();
let cancel = CancellationToken::new();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
shard: self.shard,
generation: self.generation,
intent: self.intent.clone(),
config: self.config.clone(),
observed: self.observed.clone(),
pageservers: pageservers.clone(),
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
cancel: cancel.clone(),
persistence: persistence.clone(),
};
let reconcile_seq = self.sequence;
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
let join_handle = tokio::task::spawn(async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
let result = reconciler.reconcile().await;
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
})
.ok();
});
self.reconciler = Some(ReconcilerHandle {
sequence: self.sequence,
handle: join_handle,
cancel,
});
Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
})
}
}
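
From the caller's side the intended pattern is: mutate intent, call maybe_reconcile, then await the returned waiter. A minimal in-crate sketch (the function name is illustrative; the result channel and shared maps come from the service's own state):

#[allow(dead_code)]
async fn reconcile_and_wait(
    tenant: &mut TenantState,
    result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
    pageservers: &Arc<HashMap<NodeId, Node>>,
    compute_hook: &Arc<ComputeHook>,
    config: &service::Config,
    persistence: &Arc<Persistence>,
) -> Result<(), ReconcileWaitError> {
    if let Some(waiter) =
        tenant.maybe_reconcile(result_tx, pageservers, compute_hook, config, persistence)
    {
        // Block up to 30s for the spawned Reconciler to drive observed state
        // to match intent; reconcile failures surface via the error_waiter path.
        waiter.wait_timeout(Duration::from_secs(30)).await?;
    }
    Ok(())
}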


@@ -1,14 +1,27 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize};
use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId};
use pageserver_api::{
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
shard::TenantShardId,
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId},
};
pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: PathBuf,
jwt_token: Option<String>,
public_key_path: Option<Utf8PathBuf>,
client: reqwest::Client,
}
@@ -16,7 +29,7 @@ const COMMAND: &str = "attachment_service";
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_id: TenantId,
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
}
@@ -27,7 +40,7 @@ pub struct AttachHookResponse {
#[derive(Serialize, Deserialize)]
pub struct InspectRequest {
pub tenant_id: TenantId,
pub tenant_shard_id: TenantShardId,
}
#[derive(Serialize, Deserialize)]
@@ -35,6 +48,125 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low-level operation.
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits there.
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
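
Since FromStr and From<...> for String are both defined, the string forms round-trip; a small in-crate sketch:

#[cfg(test)]
mod scheduling_policy_sketch {
    use super::*;
    use std::str::FromStr;

    #[test]
    fn round_trip() {
        for s in ["active", "filling", "pause", "draining"] {
            // Parsing then re-encoding yields the original string.
            let policy = NodeSchedulingPolicy::from_str(s).unwrap();
            assert_eq!(String::from(policy), s);
        }
    }
}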
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = env.base_data_dir.join("attachments.json");
@@ -49,10 +181,34 @@ impl AttachmentService {
listen_url.port().unwrap()
);
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let jwt_token = env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
let public_key_path =
camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
.unwrap();
(Some(jwt_token), Some(public_key_path))
}
};
Self {
env: env.clone(),
path,
listen,
jwt_token,
public_key_path,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
@@ -67,72 +223,199 @@ impl AttachmentService {
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
background_process::start_process(
let mut args = vec!["-l", &self.listen, "-p", &path_str]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
if let Some(public_key_path) = &self.public_key_path {
args.push(format!("--public-key={public_key_path}"));
}
let result = background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
["-l", &self.listen, "-p", &path_str],
[],
args,
[(
"NEON_REPO_DIR".to_string(),
self.env.base_data_dir.to_string_lossy().to_string(),
)],
background_process::InitialPidFile::Create(self.pid_file()),
// TODO: a real status check
|| async move { anyhow::Ok(true) },
|| async {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
},
)
.await
.await;
for ps_conf in &self.env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
self.node_register(NodeRegisterRequest {
node_id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
result
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
pub async fn attach_hook(
/// Simple HTTP request wrapper for calling into the attachment service
async fn dispatch<RQ, RS>(
&self,
tenant_id: TenantId,
pageserver_id: NodeId,
) -> anyhow::Result<Option<u32>> {
use hyper::StatusCode;
method: reqwest::Method,
path: String,
body: Option<RQ>,
) -> anyhow::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join("attach-hook")
.join(&path)
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await?;
let response = response.error_from_body().await?;
Ok(response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
#[instrument(skip(self))]
pub async fn attach_hook(
&self,
tenant_shard_id: TenantShardId,
pageserver_id: NodeId,
) -> anyhow::Result<Option<u32>> {
let request = AttachHookRequest {
tenant_id,
tenant_shard_id,
node_id: Some(pageserver_id),
};
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = self
.dispatch::<_, AttachHookResponse>(
Method::POST,
"attach-hook".to_string(),
Some(request),
)
.await?;
let response = response.json::<AttachHookResponse>().await?;
Ok(response.gen)
}
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
#[instrument(skip(self))]
pub async fn inspect(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<Option<(u32, NodeId)>> {
let request = InspectRequest { tenant_shard_id };
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join("inspect")
.unwrap();
let response = self
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
.await?;
let request = InspectRequest { tenant_id };
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<InspectResponse>().await?;
Ok(response.attachment)
}
#[instrument(skip(self))]
pub async fn tenant_create(
&self,
req: TenantCreateRequest,
) -> anyhow::Result<TenantCreateResponse> {
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
.await
}
#[instrument(skip(self))]
pub async fn tenant_migrate(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id,
node_id,
}),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(
Method::PUT,
format!("node/{}/config", req.node_id),
Some(req),
)
.await
}
#[instrument(skip(self))]
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}
#[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
pub async fn tenant_timeline_create(
&self,
tenant_id: TenantId,
req: TimelineCreateRequest,
) -> anyhow::Result<TimelineInfo> {
self.dispatch(
Method::POST,
format!("tenant/{tenant_id}/timeline"),
Some(req),
)
.await
}
}


@@ -1,337 +0,0 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{
http::{
endpoint::{self},
error::ApiError,
json::{json_request, json_response},
RequestExt, RouterBuilder,
},
id::{NodeId, TenantId},
tcp_listener,
};
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
ValidateResponseTenant,
};
use control_plane::attachment_service::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
/// Host and port to listen on, like `127.0.0.1:1234`
#[arg(short, long)]
listen: std::net::SocketAddr,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: PathBuf,
}
// The persistent state of each Tenant
#[derive(Serialize, Deserialize, Clone)]
struct TenantState {
// Currently attached pageserver
pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
generation: u32,
}
fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
V: Clone + Serialize,
{
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
transformed
.collect::<HashMap<String, V>>()
.serialize(serializer)
}
fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
{
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
TenantId::from_hex(k)
.map(|k| (k, v))
.map_err(serde::de::Error::custom)
})
.collect()
}
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
tenants: HashMap<TenantId, TenantState>,
#[serde(skip)]
path: PathBuf,
}
impl PersistentState {
async fn save(&self) -> anyhow::Result<()> {
let bytes = serde_json::to_vec(self)?;
tokio::fs::write(&self.path, &bytes).await?;
Ok(())
}
async fn load(path: &Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
decoded.path = path.to_owned();
Ok(decoded)
}
async fn load_or_new(path: &Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path.display());
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path.display());
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
}
}
}
}
/// State available to HTTP request handlers
#[derive(Clone)]
struct State {
inner: Arc<tokio::sync::RwLock<PersistentState>>,
}
impl State {
fn new(persistent_state: PersistentState) -> State {
Self {
inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
}
}
}
#[inline(always)]
fn get_state(request: &Request<Body>) -> &State {
request
.data::<Arc<State>>()
.expect("unknown state type")
.as_ref()
}
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
let mut response = ReAttachResponse {
tenants: Vec::new(),
};
for (t, state) in &mut locked.tenants {
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
// TODO(sharding): make this shard-aware
id: TenantShardId::unsharded(*t),
gen: state.generation,
});
}
}
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, response)
}
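// Example exchange (hypothetical IDs), assuming the control_api types serialize to
// plain JSON with the field names as declared:
//
//   POST /re-attach   {"node_id": 1}
//   200 OK            {"tenants": [{"id": "1f359dd625e519a1a4e8d7509690f6fc", "gen": 6}]}
//
// Note that each matching tenant's generation is incremented before the response is
// written, so a restarting pageserver always re-attaches under a fresh generation.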
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let locked = get_state(&req).inner.read().await;
let mut response = ValidateResponse {
tenants: Vec::new(),
};
for req_tenant in validate_req.tenants {
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
let valid = tenant_state.generation == req_tenant.gen;
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {})",
req_tenant.id,
req_tenant.gen,
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid,
});
}
}
json_response(StatusCode::OK, response)
}
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
let tenant_state = locked
.tenants
.entry(attach_req.tenant_id)
.or_insert_with(|| TenantState {
pageserver: attach_req.node_id,
generation: 0,
});
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
}
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
tracing::info!(
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
attach_req.tenant_id,
tenant_state.generation,
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(
StatusCode::OK,
AttachHookResponse {
gen: attach_req.node_id.map(|_| generation),
},
)
}
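// Example exchange for the attach hook (hypothetical values): attaching increments the
// generation and returns it, while "node_id": null detaches and yields "gen": null.
//
//   POST /attach-hook   {"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", "node_id": 2}
//   200 OK              {"gen": 7}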
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let locked = state.write().await;
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
json_response(
StatusCode::OK,
InspectResponse {
attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
},
)
}
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
let args = Cli::parse();
tracing::info!(
"Starting, state at {}, listening on {}",
args.path.to_string_lossy(),
args.listen
);
let persistent_state = PersistentState::load_or_new(&args.path).await;
let http_listener = tcp_listener::bind(args.listen)?;
let router = make_router(persistent_state)
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
})?;
Ok(())
}

View File

@@ -8,19 +8,24 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::tenant_migration::migrate_tenant;
use control_plane::{broker, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -30,6 +35,7 @@ use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use url::Host;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -276,10 +282,10 @@ fn print_timeline(
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
tenant_shard_id: &TenantShardId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(tenant_id)
.timeline_list(tenant_shard_id)
.await?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -297,6 +303,20 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
}
}
// Helper function to parse --tenant_id option, for commands that accept a shard suffix
fn get_tenant_shard_id(
sub_match: &ArgMatches,
env: &local_env::LocalEnv,
) -> anyhow::Result<TenantShardId> {
if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
tenant_id_from_arguments
} else if let Some(default_id) = env.default_tenant_id {
Ok(TenantShardId::unsharded(default_id))
} else {
anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant");
}
}
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
sub_match
.get_one::<String>("tenant-id")
@@ -305,6 +325,14 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
.context("Failed to parse tenant id from the argument string")
}
fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
sub_match
.get_one::<String>("tenant-id")
.map(|id_str| TenantShardId::from_str(id_str))
.transpose()
.context("Failed to parse tenant shard id from the argument string")
}
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
sub_match
.get_one::<String>("timeline-id")
@@ -393,47 +421,68 @@ async fn handle_tenant(
Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config")
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.map(|vals: clap::parser::ValuesRef<'_, String>| {
vals.flat_map(|c| c.split_once(':')).collect()
})
.unwrap_or_default();
let shard_count: u8 = create_match
.get_one::<u8>("shard-count")
.cloned()
.unwrap_or(0);
let shard_stripe_size: Option<u32> =
create_match.get_one::<u32>("shard-stripe-size").cloned();
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
// If tenant ID was not specified, generate one
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
let generation = if env.control_plane_api.is_some() {
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.attach_hook(tenant_id, pageserver.conf.id)
.await?
} else {
None
};
pageserver
.tenant_create(tenant_id, generation, tenant_conf)
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_create(TenantCreateRequest {
// Note that ::unsharded here isn't actually because the tenant is unsharded: it's because the
// attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
// type is used both in the attachment service (for creating tenants) and in the pageserver (for creating shards).
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
count: ShardCount(shard_count),
stripe_size: shard_stripe_size
.map(ShardStripeSize)
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
},
config: tenant_conf,
})
.await?;
println!("tenant {tenant_id} successfully created on the pageserver");
// Create an initial timeline for the new tenant
let new_timeline_id = parse_timeline_id(create_match)?;
let new_timeline_id =
parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
let pg_version = create_match
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver
.timeline_create(
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
// different shards picking different start lsns. Maybe we have to teach attachment service
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
attachment_service
.tenant_timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
ancestor_start_lsn: None,
existing_initdb_timeline_id: None,
pg_version: Some(pg_version),
},
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_string(),
@@ -441,9 +490,7 @@ async fn handle_tenant(
new_timeline_id,
)?;
println!(
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
);
println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
if create_match.get_flag("set-default") {
println!("Setting tenant {tenant_id} as a default one");
@@ -470,14 +517,64 @@ async fn handle_tenant(
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_id, new_pageserver).await?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_migrate(tenant_shard_id, new_pageserver_id)
.await?;
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
}
Some(("status", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let mut shard_table = comfy_table::Table::new();
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
let mut tenant_synthetic_size = None;
let attachment_service = AttachmentService::from_env(env);
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
let size = pageserver
.http_client
.tenant_details(shard.shard_id)
.await?
.tenant_info
.current_physical_size
.unwrap();
shard_table.add_row([
format!("{}", shard.shard_id.shard_slug()),
format!("{}", shard.node_id.0),
format!("{} MiB", size / (1024 * 1024)),
]);
if shard.shard_id.is_zero() {
tenant_synthetic_size =
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
}
}
let Some(synthetic_size) = tenant_synthetic_size else {
bail!("Shard 0 not found")
};
let mut tenant_table = comfy_table::Table::new();
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
tenant_table.add_row([
"Synthetic size".to_string(),
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
]);
println!("{tenant_table}");
println!("{shard_table}");
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
}
@@ -489,8 +586,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
@@ -505,18 +604,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.context("Failed to parse postgres version from the argument string")?;
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)
let attachment_service = AttachmentService::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
existing_initdb_timeline_id: None,
ancestor_start_lsn: None,
pg_version: Some(pg_version),
};
let timeline_info = attachment_service
.tenant_timeline_create(tenant_id, create_req)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
@@ -574,7 +674,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
None,
pg_version,
ComputeMode::Primary,
DEFAULT_PAGESERVER_ID,
)?;
println!("Done");
}
@@ -598,17 +697,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)
let new_timeline_id = TimelineId::generate();
let attachment_service = AttachmentService::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: Some(ancestor_timeline_id),
existing_initdb_timeline_id: None,
ancestor_start_lsn: start_lsn,
pg_version: None,
};
let timeline_info = attachment_service
.tenant_timeline_create(tenant_id, create_req)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -635,8 +735,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
match sub_name {
"list" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id)
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
@@ -661,7 +763,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
for (endpoint_id, endpoint) in cplane
.endpoints
.iter()
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
{
let lsn_str = match endpoint.mode {
ComputeMode::Static(lsn) => {
@@ -680,7 +782,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
};
let branch_name = timeline_name_mappings
.get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
.get(&TenantTimelineId::new(
tenant_shard_id.tenant_id,
endpoint.timeline_id,
))
.map(|name| name.as_str())
.unwrap_or("?");
@@ -728,13 +833,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.copied()
.unwrap_or(false);
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
@@ -762,7 +860,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
http_port,
pg_version,
mode,
pageserver_id,
)?;
}
"start" => {
@@ -772,9 +869,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
} else {
DEFAULT_PAGESERVER_ID
None
};
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
@@ -805,7 +904,38 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
endpoint.timeline_id,
)?;
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
(
vec![(parsed.0, parsed.1.unwrap_or(5432))],
// If the caller is telling us which pageserver to use, this is not a tenant that is
// fully managed by the attachment service, and therefore not sharded.
ShardParameters::DEFAULT_STRIPE_SIZE,
)
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let attachment_service = AttachmentService::from_env(env);
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
let pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
let stripe_size = locate_result.shard_params.stripe_size;
(pageservers, stripe_size)
};
assert!(!pageservers.is_empty());
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
@@ -816,7 +946,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
println!("Starting existing endpoint {endpoint_id}...");
endpoint
.start(&auth_token, safekeepers, remote_ext_config)
.start(
&auth_token,
safekeepers,
pageservers,
remote_ext_config,
stripe_size.0 as usize,
)
.await?;
}
"reconfigure" => {
@@ -827,15 +963,31 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageserver_id =
let pageservers =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
vec![(
pageserver.pg_connection_config.host().clone(),
pageserver.pg_connection_config.port(),
)]
} else {
None
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported malformed host"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>()
};
endpoint.reconfigure(pageserver_id).await?;
endpoint.reconfigure(pageservers).await?;
}
"stop" => {
let endpoint_id = sub_args
@@ -959,6 +1111,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let attachment_service = AttachmentService::from_env(env);
attachment_service
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
@@ -1358,6 +1525,8 @@ fn cli() -> Command {
.arg(pg_version_arg.clone())
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
@@ -1368,6 +1537,9 @@ fn cli() -> Command {
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
)
.subcommand(
Command::new("pageserver")
@@ -1387,6 +1559,12 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("attachment_service")

View File

@@ -49,10 +49,11 @@ use compute_api::spec::RemoteExtSpec;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use crate::attachment_service::AttachmentService;
use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus};
@@ -69,7 +70,6 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
pageserver_id: NodeId,
}
//
@@ -121,19 +121,14 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
pageserver_id: NodeId,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver,
timeline_id,
mode,
tenant_id,
@@ -159,7 +154,6 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates: true,
pageserver_id,
})?,
)?;
std::fs::write(
@@ -218,7 +212,6 @@ pub struct Endpoint {
// These are not part of the endpoint as such, but the environment
// the endpoint runs in.
pub env: LocalEnv,
pageserver: PageServerNode,
// Optimizations
skip_pg_catalog_updates: bool,
@@ -241,15 +234,11 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver,
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,
@@ -469,11 +458,21 @@ impl Endpoint {
}
}
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
pageservers
.iter()
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
.collect::<Vec<_>>()
.join(",")
}
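// For a sharded tenant this yields one URL per shard, comma-separated, e.g.
// (hypothetical hosts and ports):
//   postgresql://no_user@ps1.local:6400,postgresql://no_user@ps2.local:6401
// The compute consumes this list together with shard_stripe_size (see start() below).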
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
@@ -487,13 +486,9 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstring.is_empty());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
@@ -543,6 +538,7 @@ impl Endpoint {
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -665,7 +661,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -675,25 +671,27 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
// If we weren't given explicit pageservers, query the attachment service
if pageservers.is_empty() {
let attachment_service = AttachmentService::from_env(&self.env);
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
}
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstr.is_empty());
spec.pageserver_connstring = Some(pageserver_connstr);
let client = reqwest::Client::new();
let response = client
.post(format!(

View File

@@ -14,4 +14,3 @@ pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod tenant_migration;

View File

@@ -251,7 +251,13 @@ impl LocalEnv {
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
Ok(conf)
} else {
bail!("could not find pageserver {id}")
let have_ids = self
.pageservers
.iter()
.map(|node| format!("{}:{}", node.id, node.listen_http_addr))
.collect::<Vec<_>>();
let joined = have_ids.join(",");
bail!("could not find pageserver {id}, have ids {joined}")
}
}

View File

@@ -17,7 +17,9 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -106,6 +108,16 @@ impl PageServerNode {
"control_plane_api='{}'",
control_plane_api.as_str()
));
// Attachment service uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
}
}
if !cli_overrides
@@ -301,16 +313,8 @@ impl PageServerNode {
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
}
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let mut settings = settings.clone();
let config = models::TenantConfig {
pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
let result = models::TenantConfig {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
@@ -371,11 +375,26 @@ impl PageServerNode {
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
} else {
Ok(result)
}
}
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let config = Self::parse_config(settings.clone())?;
let request = models::TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
generation,
config,
shard_parameters: ShardParameters::default(),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -471,18 +490,21 @@ impl PageServerNode {
pub async fn location_config(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_id, config, flush_ms)
.location_config(tenant_shard_id, config, flush_ms)
.await?)
}
pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_id).await?)
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
}
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
@@ -494,15 +516,13 @@ impl PageServerNode {
pub async fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: Option<TimelineId>,
tenant_shard_id: TenantShardId,
new_timeline_id: TimelineId,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
let req = models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
@@ -510,7 +530,10 @@ impl PageServerNode {
pg_version,
existing_initdb_timeline_id,
};
Ok(self.http_client.timeline_create(tenant_id, &req).await?)
Ok(self
.http_client
.timeline_create(tenant_shard_id, &req)
.await?)
}
/// Import a basebackup prepared using either:
@@ -588,4 +611,14 @@ impl PageServerNode {
Ok(())
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<TenantHistorySize> {
Ok(self
.http_client
.tenant_synthetic_size(tenant_shard_id)
.await?)
}
}

View File

@@ -1,220 +0,0 @@
//!
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
//! point to the new pageserver.
//!
use crate::local_env::LocalEnv;
use crate::{
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
pageserver::PageServerNode,
};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::time::Duration;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
/// Given an attached pageserver, retrieve the LSN for all timelines
async fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_id).await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
.collect())
}
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
async fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(_e) => {
println!(
"🕑 Waiting for pageserver {} to activate...",
pageserver.conf.id
);
std::thread::sleep(Duration::from_millis(500));
continue;
}
};
let mut any_behind: bool = false;
for (timeline_id, baseline_lsn) in &baseline {
match latest.get(timeline_id) {
Some(latest_lsn) => {
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
if latest_lsn < baseline_lsn {
any_behind = true;
}
}
None => {
// Expected timeline isn't yet visible on migration destination.
// (IRL we would have to account for timeline deletion, but this
// is just a test helper)
any_behind = true;
}
}
}
if !any_behind {
println!("✅ LSN caught up. Proceeding...");
break;
} else {
std::thread::sleep(Duration::from_millis(500));
}
}
Ok(())
}
/// This function spans multiple services, to demonstrate live migration of a tenant
/// between pageservers:
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub async fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
println!("🤔 Checking existing status...");
let attachment_service = AttachmentService::from_env(env);
fn build_location_config(
mode: LocationConfigMode,
generation: Option<u32>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation,
secondary_conf,
tenant_conf: TenantConfig::default(),
shard_number: 0,
shard_count: 0,
shard_stripe_size: 0,
}
}
let previous = attachment_service.inspect(tenant_id).await?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");
return Ok(());
}
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}
println!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps.conf.id
);
match dest_ps
.tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
.await
{
Ok(()) => {}
Err(_) => {
println!(" (skipping, destination wasn't in secondary mode)")
}
}
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline).await?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id {
println!(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
}
}
for other_ps_conf in &env.pageservers {
if other_ps_conf.id == dest_ps.conf.id {
continue;
}
let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list().await?;
// Check if this tenant is attached
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i.tenant_id == tenant_id);
if !found {
continue;
}
// Downgrade to a secondary location
let secondary_conf = build_location_config(
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
println!(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps
.location_config(tenant_id, secondary_conf, None)
.await?;
}
println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");
Ok(())
}

View File

@@ -75,6 +75,10 @@ pub struct ComputeSpec {
pub remote_extensions: Option<RemoteExtSpec>,
pub pgbouncer_settings: Option<HashMap<String, String>>,
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<usize>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -82,10 +86,13 @@ pub struct ComputeSpec {
#[serde(rename_all = "snake_case")]
pub enum ComputeFeature {
// XXX: Add more feature flags here.
/// Enable the experimental activity monitor logic, which uses `pg_stat_database` to
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
// This is a special feature flag that is used to represent unknown feature flags.
// Basically all unknown to enum flags are represented as this one. See unit test
// `parse_unknown_features()` for more details.
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.
#[serde(other)]
UnknownFeature,
}
@@ -282,4 +289,23 @@ mod tests {
assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
}
#[test]
fn parse_known_features() {
// Test that we can properly parse known feature flags.
let file = File::open("tests/cluster_spec.json").unwrap();
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
let ob = json.as_object_mut().unwrap();
// Add known feature flags.
let features = vec!["activity_monitor_experimental"];
ob.insert("features".into(), features.into());
let spec: ComputeSpec = serde_json::from_value(json).unwrap();
assert_eq!(
spec.features,
vec![ComputeFeature::ActivityMonitorExperimental]
);
}
}

View File

@@ -19,6 +19,7 @@ strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
workspace_hack.workspace = true

View File

@@ -3,6 +3,8 @@ use byteorder::{ByteOrder, BE};
use serde::{Deserialize, Serialize};
use std::fmt;
use crate::reltag::{BlockNumber, RelTag};
/// Key used in the Repository kv-store.
///
/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
@@ -146,6 +148,22 @@ pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
RelTag {
spcnode: key.field2,
dbnode: key.field3,
relnode: key.field4,
forknum: key.field5,
},
key.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
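// Sketch of the decoding (field roles as in the match arm above): for a key with
// field1 == 0x00, fields 2..5 carry (spcnode, dbnode, relnode, forknum) and field6
// is the block number within that relation.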
impl std::str::FromStr for Key {
type Err = anyhow::Error;

View File

@@ -4,7 +4,7 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
time::SystemTime,
time::{Duration, SystemTime},
};
use byteorder::{BigEndian, ReadBytesExt};
@@ -18,7 +18,10 @@ use utils::{
lsn::Lsn,
};
use crate::{reltag::RelTag, shard::TenantShardId};
use crate::{
reltag::RelTag,
shard::{ShardCount, ShardStripeSize, TenantShardId},
};
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
@@ -188,6 +191,31 @@ pub struct TimelineCreateRequest {
pub pg_version: Option<u32>,
}
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
}
impl ShardParameters {
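// 256 MiB expressed in 8 KiB pages: (256 * 1024) KiB / 8 KiB per page = 32768 pages.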
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub fn is_unsharded(&self) -> bool {
self.count == ShardCount(0)
}
}
impl Default for ShardParameters {
fn default() -> Self {
Self {
count: ShardCount(0),
stripe_size: Self::DEFAULT_STRIPE_SIZE,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantCreateRequest {
@@ -195,6 +223,12 @@ pub struct TenantCreateRequest {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
// If omitted, create a single shard with TenantShardId::unsharded()
#[serde(default)]
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
pub shard_parameters: ShardParameters,
#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
@@ -217,7 +251,7 @@ impl std::ops::Deref for TenantCreateRequest {
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default)]
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
@@ -232,21 +266,41 @@ pub struct TenantConfig {
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
// We defer the parsing of the eviction_policy field to the request handler.
// Otherwise we'd have to move the types for eviction policy into this package.
// We might do that once the eviction feature has stabilized.
// For now, this field is not even documented in the openapi_spec.yml.
pub eviction_policy: Option<serde_json::Value>,
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
NoEviction,
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
}
impl EvictionPolicy {
pub fn discriminant_str(&self) -> &'static str {
match self {
EvictionPolicy::NoEviction => "NoEviction",
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
pub period: Duration,
#[serde(with = "humantime_serde")]
pub threshold: Duration,
}
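// With humantime_serde and the internal "kind" tag, a policy round-trips through JSON
// like the following (hypothetical durations):
//   {"kind": "LayerAccessThreshold", "period": "20m", "threshold": "20m"}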
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
@@ -255,19 +309,21 @@ pub enum LocationConfigMode {
Detached,
}
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfigSecondary {
pub warm: bool,
}
/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
#[serde(default)]
pub generation: Option<u32>,
// If requesting mode `Secondary`, configuration for that.
#[serde(default)]
pub secondary_conf: Option<LocationConfigSecondary>,
@@ -280,11 +336,17 @@ pub struct LocationConfig {
#[serde(default)]
pub shard_stripe_size: u32,
// If requesting mode `Secondary`, configuration for that.
// Custom storage configuration for the tenant, if any
// This configuration only affects attached mode, but should be provided irrespective
// of the mode, as a secondary location might transition on startup if the response
// to the `/re-attach` control plane API requests it.
pub tenant_conf: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct LocationConfigListResponse {
pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
}
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TenantCreateResponse(pub TenantId);
@@ -297,7 +359,7 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
pub tenant_id: TenantId,
pub tenant_id: TenantShardId,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
@@ -368,6 +430,8 @@ pub struct TenantInfo {
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -658,6 +722,17 @@ pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantHistorySize {
pub id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
///
/// Will be none if `?inputs_only=true` was given.
pub size: Option<u64>,
}
impl PagestreamFeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
@@ -910,6 +985,7 @@ mod tests {
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -930,6 +1006,7 @@ mod tests {
},
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: None,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),

View File

@@ -32,6 +32,9 @@ pub struct RelTag {
pub relnode: Oid,
}
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
pub type BlockNumber = u32;
impl PartialOrd for RelTag {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))

View File

@@ -1,6 +1,9 @@
use std::{ops::RangeInclusive, str::FromStr};
use crate::key::{is_rel_block_key, Key};
use crate::{
key::{is_rel_block_key, Key},
models::ShardParameters,
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
@@ -85,6 +88,12 @@ impl TenantShardId {
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
}
/// Formatting helper
@@ -333,7 +342,7 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
stripe_size: ShardStripeSize,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
@@ -403,6 +412,17 @@ impl ShardIdentity {
}
}
/// For use when creating ShardIdentity instances for new shards, where a creation request
/// specifies the ShardParameters that apply to all shards.
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
Self {
number,
count: params.count,
layout: LAYOUT_V1,
stripe_size: params.stripe_size,
}
}
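// Hypothetical usage (not part of this diff): when materializing the shards of a newly
// created tenant, each shard derives its identity from the request's parameters:
//
//     let identity = ShardIdentity::from_params(ShardNumber(0), &ShardParameters::default());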
fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}

View File

@@ -15,7 +15,11 @@ aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
hyper = { workspace = true, features = [] }
hyper-util = { workspace = true, features = [] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
futures.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -5,7 +5,9 @@ use std::collections::HashMap;
use std::env;
use std::num::NonZeroU32;
use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
@@ -13,12 +15,14 @@ use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::RetryOptions;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::blob::CopyStatus;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
use bytes::Bytes;
use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::StatusCode;
use http_types::{StatusCode, Url};
use tokio::time::Instant;
use tracing::debug;
use crate::s3_bucket::RequestKind;
@@ -323,10 +327,49 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
}
async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
Err(anyhow::anyhow!(
"copy for azure blob storage is not implemented"
))
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let _permit = self.permit(RequestKind::Copy).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let source_url = format!(
"{}/{}",
self.client.url()?,
self.relative_path_to_name(from)
);
let builder = blob_client.copy(Url::from_str(&source_url)?);
let result = builder.into_future().await?;
let mut copy_status = result.copy_status;
let start_time = Instant::now();
const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
loop {
match copy_status {
CopyStatus::Aborted => {
anyhow::bail!("Received abort for copy from {from} to {to}.");
}
CopyStatus::Failed => {
anyhow::bail!("Received failure response for copy from {from} to {to}.");
}
CopyStatus::Success => return Ok(()),
CopyStatus::Pending => (),
}
// The copy is taking longer: wait a second and then poll the status again.
// TODO estimate time based on copy_progress and adjust time based on that
tokio::time::sleep(Duration::from_millis(1000)).await;
let properties = blob_client.get_properties().into_future().await?;
let Some(status) = properties.blob.properties.copy_status else {
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
return Ok(());
};
if start_time.elapsed() > MAX_WAIT_TIME {
anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
MAX_WAIT_TIME.as_secs_f32(),
properties.blob.properties.copy_progress,
);
}
copy_status = status;
}
}
}

View File

@@ -36,7 +36,9 @@ use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use futures_util::TryStreamExt;
use http_body::Frame;
use http_body_util::StreamBody;
use scopeguard::ScopeGuard;
use super::StorageMetadata;
@@ -469,8 +471,8 @@ impl RemoteStorage for S3Bucket {
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(from);
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
let body = StreamBody::new(from.map_ok(Frame::data));
let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
let res = self
.client

View File

@@ -0,0 +1,288 @@
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::RemotePath;
use std::collections::HashSet;
use std::sync::Arc;
use test_context::test_context;
use tracing::debug;
use crate::common::{download_to_vec, upload_stream, wrap_stream};
use super::{
MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
};
/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and the related S3 credential env vars to be specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark a test as ignored at runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// since current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledStorageWithTestBlobs)]
#[tokio::test]
async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. The test skips the real run and passes if the env vars are not set.
/// See `pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorage::Enabled(ctx) => ctx,
MaybeEnabledStorage::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorage::Enabled(ctx) => ctx,
MaybeEnabledStorage::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
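
Note the range semantics the assertions above rely on: the end bound of download_byte_range is exclusive, and an end past the object length is clamped rather than rejected. In slice terms (a sketch, not part of the diff):

let orig = b"remote blob data here";
assert_eq!(&orig[4..10], b"te blo"); // end bound is exclusive, like a Rust slice
// An end far past the real length behaves like an open range on the server side,
// whereas an out-of-bounds slice would panic locally:
assert_eq!(&orig[8..], b"lob data here");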
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let path_dest = RemotePath::new(Utf8Path::new(
format!("{}/file_dest", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
ctx.client.copy_object(&path, &path_dest).await?;
let dl = ctx.client.download(&path_dest).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete_objects(&[path.clone(), path_dest.clone()])
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}

View File

@@ -6,263 +6,23 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::{
AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
};
use test_context::{test_context, AsyncTestContext};
use tracing::{debug, info};
use test_context::AsyncTestContext;
use tracing::info;
mod common;
use common::{
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
upload_stream, wrap_stream,
};
#[path = "common/tests.rs"]
mod tests_azure;
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
/// See the client creation in [`create_azure_client`] for details on the required env vars.
/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark a test as ignored at runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid interference between test runs
/// * `base_prefix_str` is a common prefix to use in the client requests: we want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, the test verifies that the client returns correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/`, which should be only the `${base_prefix_str}` value
/// * with the `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With real Azure enabled and the `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
/// This way, we test the pagination implicitly, by ensuring all results are returned from the remote storage, while avoiding uploading too many blobs to Azure.
///
/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
/// If any errors appear during cleanup, they are logged, but the test does not fail or stop until cleanup finishes.
#[test_context(MaybeEnabledAzureWithTestBlobs)]
#[tokio::test]
async fn azure_pagination_should_work(
ctx: &mut MaybeEnabledAzureWithTestBlobs,
) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("Azure init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
/// Tests that the Azure client can list all files in a folder, even if the response comes paginated and requires multiple Azure queries.
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. The test skips the real run and passes if the env vars are not set.
/// See `azure_pagination_should_work` for more information.
///
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
#[tokio::test]
async fn azure_list_files_works(
ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("Azure init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzure::Enabled(ctx) => ctx,
MaybeEnabledAzure::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzure::Enabled(ctx) => ctx,
MaybeEnabledAzure::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let MaybeEnabledAzure::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
struct EnabledAzure {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,
@@ -281,13 +41,13 @@ impl EnabledAzure {
}
}
enum MaybeEnabledAzure {
enum MaybeEnabledStorage {
Enabled(EnabledAzure),
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledAzure {
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -303,7 +63,7 @@ impl AsyncTestContext for MaybeEnabledAzure {
}
}
enum MaybeEnabledAzureWithTestBlobs {
enum MaybeEnabledStorageWithTestBlobs {
Enabled(AzureWithTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, AzureWithTestBlobs),
@@ -316,7 +76,7 @@ struct AzureWithTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -367,7 +127,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledAzureWithSimpleTestBlobs {
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(AzureWithSimpleTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
@@ -378,7 +138,7 @@ struct AzureWithSimpleTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {

View File

@@ -6,259 +6,23 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::{test_context, AsyncTestContext};
use tracing::{debug, info};
use test_context::AsyncTestContext;
use tracing::info;
mod common;
use common::{
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
upload_stream, wrap_stream,
};
#[path = "common/tests.rs"]
mod tests_s3;
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark a test as ignored at runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between test runs
/// * `base_prefix_str` is a common prefix to use in the client requests: we want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, the test verifies that the client returns correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/`, which should be only the `${base_prefix_str}` value
/// * with the `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With real S3 enabled and the `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we test the pagination implicitly, by ensuring all results are returned from the remote storage, while avoiding uploading too many blobs to S3,
/// since the current default AWS S3 pagination limit is 1000 keys
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax).
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during cleanup, they are logged, but the test does not fail or stop until cleanup finishes.
#[test_context(MaybeEnabledS3WithTestBlobs)]
#[tokio::test]
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
};
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. The test skips the real run and passes if the env vars are not set.
/// See `s3_pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
#[tokio::test]
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let MaybeEnabledS3::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
struct EnabledS3 {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,
@@ -277,13 +41,13 @@ impl EnabledS3 {
}
}
enum MaybeEnabledS3 {
enum MaybeEnabledStorage {
Enabled(EnabledS3),
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -299,7 +63,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
}
}
enum MaybeEnabledS3WithTestBlobs {
enum MaybeEnabledStorageWithTestBlobs {
Enabled(S3WithTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithTestBlobs),
@@ -312,7 +76,7 @@ struct S3WithTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -363,7 +127,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledS3WithSimpleTestBlobs {
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
@@ -374,7 +138,7 @@ struct S3WithSimpleTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {

View File

@@ -1,7 +1,8 @@
//! Tracing wrapper for Hyper HTTP server
use hyper::body::Body;
use hyper::HeaderMap;
use hyper::{Body, Request, Response};
use hyper::{Request, Response};
use std::future::Future;
use tracing::Instrument;
use tracing_opentelemetry::OpenTelemetrySpanExt;
@@ -35,14 +36,14 @@ pub enum OtelName<'a> {
/// instrumentation libraries at:
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
/// If a Hyper crate appears, consider switching to that.
pub async fn tracing_handler<F, R>(
req: Request<Body>,
pub async fn tracing_handler<B1: Body, B2: Body, F, R>(
req: Request<B1>,
handler: F,
otel_name: OtelName<'_>,
) -> Response<Body>
) -> Response<B2>
where
F: Fn(Request<Body>) -> R,
R: Future<Output = Response<Body>>,
F: Fn(Request<B1>) -> R,
R: Future<Output = Response<B2>>,
{
// Create a tracing span, with context propagated from the incoming
// request if any.
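
With the handler generic over B1 and B2, any http_body::Body implementations can flow through the tracing wrapper. A sketch of a handler satisfying the F: Fn(Request<B1>) -> R bound under hyper 1.0 types (names hypothetical, not part of the diff):

use bytes::Bytes;
use http_body_util::Full;
use hyper::{Request, Response};

// Here B1 = hyper::body::Incoming (what the server hands us) and B2 = Full<Bytes>.
async fn hello(_req: Request<hyper::body::Incoming>) -> Response<Full<Bytes>> {
    Response::new(Full::new(Bytes::from_static(b"hello")))
}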

View File

@@ -22,6 +22,7 @@ chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
hyper = { workspace = true, features = ["full"] }
http-body-util = { workspace = true, features = [] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true

View File

@@ -4,7 +4,10 @@ use crate::http::{
error::ApiError,
json::{json_request, json_response},
};
use hyper::{Body, Request, Response, StatusCode};
use bytes::Bytes;
use http_body_util::Full;
use hyper::{Request, Response, StatusCode};
use routerify::Body;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -151,7 +154,7 @@ pub struct FailpointConfig {
pub async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot manage failpoints because storage was compiled without failpoints support"

View File

@@ -4,11 +4,11 @@ use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
use hyper::{header::CONTENT_TYPE, Request, Response};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use routerify::{Body, Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
@@ -238,7 +238,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let (tx, rx) = mpsc::channel(1);
let body = Body::wrap_stream(ReceiverStream::new(rx));
let body = Body::from_stream(ReceiverStream::new(rx));
let mut writer = ChannelWriter::new(128 * 1024, tx);
@@ -284,7 +284,7 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
Ok(response)
}
pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
pub fn add_request_id_middleware<B: hyper::body::Body + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
let request_id = match req.headers().get(&X_REQUEST_ID_HEADER) {
@@ -317,7 +317,7 @@ async fn add_request_id_header_to_response(
Ok(res)
}
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
pub fn make_router() -> RouterBuilder<routerify::Body, ApiError> {
Router::builder()
.middleware(add_request_id_middleware())
.middleware(Middleware::post_with_info(
@@ -328,11 +328,11 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
}
pub fn attach_openapi_ui(
router_builder: RouterBuilder<hyper::Body, ApiError>,
router_builder: RouterBuilder<routerify::Body, ApiError>,
spec: &'static [u8],
spec_mount_path: &'static str,
ui_mount_path: &'static str,
) -> RouterBuilder<hyper::Body, ApiError> {
) -> RouterBuilder<routerify::Body, ApiError> {
router_builder
.get(spec_mount_path,
move |r| request_span(r, move |_| async move {
@@ -388,7 +388,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
Ok(token)
}
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
pub fn auth_middleware<B: hyper::body::Body + Send + Sync + 'static>(
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
@@ -423,7 +423,7 @@ pub fn add_response_header_middleware<B>(
value: &str,
) -> anyhow::Result<Middleware<B, ApiError>>
where
B: hyper::body::HttpBody + Send + Sync + 'static,
B: hyper::body::Body + Send + Sync + 'static,
{
let name =
HeaderName::from_str(header).with_context(|| format!("invalid header name: {header}"))?;
@@ -464,7 +464,6 @@ pub fn check_permission_with(
#[cfg(test)]
mod tests {
use super::*;
use futures::future::poll_fn;
use hyper::service::Service;
use routerify::RequestServiceBuilder;
use std::net::{IpAddr, SocketAddr};
@@ -473,16 +472,13 @@ mod tests {
async fn test_request_id_returned() {
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
let mut service = builder.build(remote_addr);
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
panic!("request service is not ready: {:?}", e);
}
let service = builder.build(remote_addr);
let mut req: Request<Body> = Request::default();
req.headers_mut()
.append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
let resp: Response<Body> = service.call(req).await.unwrap();
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
@@ -493,13 +489,10 @@ mod tests {
async fn test_request_id_empty() {
let builder = RequestServiceBuilder::new(make_router().build().unwrap()).unwrap();
let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
let mut service = builder.build(remote_addr);
if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
panic!("request service is not ready: {:?}", e);
}
let service = builder.build(remote_addr);
let req: Request<Body> = Request::default();
let resp: Response<hyper::body::Body> = service.call(req).await.unwrap();
let resp: Response<Body> = service.call(req).await.unwrap();
let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);

View File

@@ -1,4 +1,5 @@
use hyper::{header, Body, Response, StatusCode};
use hyper::{header, Response, StatusCode};
use routerify::Body;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::error::Error as StdError;

View File

@@ -1,6 +1,8 @@
use anyhow::Context;
use bytes::Buf;
use hyper::{header, Body, Request, Response, StatusCode};
use anyhow::{anyhow, Context};
use bytes::{Buf, Bytes};
use http_body_util::{BodyExt, Full};
use hyper::{header, Request, Response, StatusCode};
use routerify::Body;
use serde::{Deserialize, Serialize};
use super::error::ApiError;
@@ -18,10 +20,14 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {
let body = hyper::body::aggregate(request.body_mut())
let body = request
.body_mut()
.collect()
.await
.map_err(|e| anyhow!(e))
.context("Failed to read request body")
.map_err(ApiError::BadRequest)?;
.map_err(ApiError::BadRequest)?
.aggregate();
if body.remaining() == 0 {
return Ok(None);
}
@@ -35,17 +41,24 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
.map_err(ApiError::BadRequest)
}
pub fn json_response<T: Serialize>(
pub fn json_response_body<T: Serialize>(
status: StatusCode,
data: T,
) -> Result<Response<Body>, ApiError> {
json_response(status, data).map(|r| r.map(Body::new))
}
pub fn json_response<T: Serialize>(
status: StatusCode,
data: T,
) -> Result<Response<Full<Bytes>>, ApiError> {
let json = serde_json::to_string(&data)
.context("Failed to serialize JSON response")
.map_err(ApiError::InternalServerError)?;
let response = Response::builder()
.status(status)
.header(header::CONTENT_TYPE, "application/json")
.body(Body::from(json))
.body(Full::from(json))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}
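
The collect()/aggregate() pair is the http-body 1.0 replacement for hyper 0.14's hyper::body::aggregate. A self-contained sketch of the same buffer-then-deserialize pattern, assuming http-body-util and serde_json (not part of the diff):

use bytes::Buf;
use http_body_util::BodyExt;

// Buffer an entire body, then deserialize JSON from the aggregated chunks.
async fn read_json<T, B>(body: B) -> anyhow::Result<T>
where
    T: for<'de> serde::Deserialize<'de>,
    B: http_body::Body,
    B::Error: std::error::Error + Send + Sync + 'static,
{
    let buf = body.collect().await?.aggregate();
    Ok(serde_json::from_reader(buf.reader())?)
}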

View File

@@ -5,4 +5,4 @@ pub mod request;
/// The current quick way to do simple HTTP routing in various Neon binaries.
/// Re-exported for the sake of a uniform approach; this could later be replaced with better alternatives if needed.
pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};
pub use routerify::{ext::RequestExt, Body, RequestServiceBuilder, RouterBuilder};

View File

@@ -3,8 +3,9 @@ use std::{borrow::Cow, str::FromStr};
use super::error::ApiError;
use anyhow::anyhow;
use hyper::{body::HttpBody, Body, Request};
use routerify::ext::RequestExt;
use http_body_util::BodyExt;
use hyper::Request;
use routerify::{ext::RequestExt, Body};
pub fn get_request_param<'a>(
request: &'a Request<Body>,
@@ -75,7 +76,7 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
}
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
match request.body_mut().data().await {
match request.body_mut().frame().await {
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
None => Ok(()),
}

View File

@@ -1,3 +1,4 @@
use std::num::ParseIntError;
use std::{fmt, str::FromStr};
use anyhow::Context;
@@ -374,6 +375,13 @@ impl fmt::Display for NodeId {
}
}
impl FromStr for NodeId {
type Err = ParseIntError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(NodeId(u64::from_str(s)?))
}
}
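
With this impl, a node id parses directly from a string via str::parse; a one-line sketch of the intended use (not part of the diff):

let node_id: NodeId = "42".parse()?; // yields NodeId(42); non-numeric input returns a ParseIntError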
#[cfg(test)]
mod tests {
use serde_assert::{Deserializer, Serializer, Token, Tokens};

View File

@@ -1,5 +1,5 @@
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
@@ -22,14 +22,14 @@ pub enum Error {
#[error("receive error body: {0}")]
ReceiveErrorBody(String),
#[error("pageserver API: {0}")]
ApiError(String),
#[error("pageserver API: {1}")]
ApiError(StatusCode, String),
}
pub type Result<T> = std::result::Result<T, Error>;
pub(crate) trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
}
impl ResponseErrorMessageExt for reqwest::Response {
@@ -41,7 +41,7 @@ impl ResponseErrorMessageExt for reqwest::Response {
let url = self.url().to_owned();
Err(match self.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
}
@@ -71,9 +71,9 @@ impl Client {
pub async fn tenant_details(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
) -> Result<pageserver_api::models::TenantDetails> {
let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
self.get(uri)
.await?
.json()
@@ -83,9 +83,12 @@ impl Client {
pub async fn list_timelines(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
@@ -179,23 +182,23 @@ impl Client {
"{}/v1/tenant/{}/secondary/download",
self.mgmt_api_endpoint, tenant_id
);
self.request(Method::POST, &uri, ())
.await?
.error_for_status()
.map(|_| ())
.map_err(|e| Error::ApiError(format!("{}", e)))
self.request(Method::POST, &uri, ()).await?;
Ok(())
}
pub async fn location_config(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
) -> Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let req_body = TenantLocationConfigRequest {
tenant_id: tenant_shard_id,
config,
};
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_id
self.mgmt_api_endpoint, tenant_shard_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -206,14 +209,23 @@ impl Client {
Ok(())
}
pub async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
let path = format!("{}/v1/location_config", self.mgmt_api_endpoint);
self.request(Method::GET, &path, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_id
self.mgmt_api_endpoint, tenant_shard_id
);
self.request(Method::POST, &uri, req)
.await?
@@ -233,4 +245,34 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> Result<Vec<TimelineInfo>> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_shard_id
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> Result<TenantHistorySize> {
let uri = format!(
"{}/v1/tenant/{}/synthetic_size",
self.mgmt_api_endpoint, tenant_shard_id
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}
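
Carrying the StatusCode in ApiError lets callers branch on specific HTTP statuses instead of string-matching the error message. A sketch of a hypothetical call site, using the Client, Error, and Result types from this module (not part of the diff):

// Treat a 404 from the management API as "absent" rather than a hard failure.
async fn tenant_exists(client: &Client, tenant_shard_id: TenantShardId) -> Result<bool> {
    match client.tenant_details(tenant_shard_id).await {
        Ok(_) => Ok(true),
        Err(Error::ApiError(StatusCode::NOT_FOUND, _)) => Ok(false),
        Err(e) => Err(e),
    }
}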

View File

@@ -2,6 +2,7 @@
use std::sync::Arc;
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::id::{TenantId, TenantTimelineId};
@@ -31,7 +32,10 @@ pub async fn get_pageserver_tenant_timelines_unsharded(
async move {
(
tenant_id,
mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
mgmt_api_client
.tenant_details(TenantShardId::unsharded(tenant_id))
.await
.unwrap(),
)
}
});

View File

@@ -21,7 +21,6 @@ tracing.workspace = true
tokio.workspace = true
tokio-util.workspace = true
pageserver = { path = ".." }
pageserver_client.workspace = true
pageserver_api.workspace = true
utils = { path = "../../libs/utils/" }

View File

@@ -1,9 +1,7 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use futures::future::join_all;
use pageserver::pgdatadir_mapping::key_to_rel_block;
use pageserver::repository;
use pageserver_api::key::is_rel_block_key;
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
@@ -269,7 +267,7 @@ async fn main_impl(
let mut rng = rand::thread_rng();
let r = &all_ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let key = Key::from_i128(key);
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
(
@@ -319,7 +317,7 @@ async fn main_impl(
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let key = Key::from_i128(key);
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) = key_to_rel_block(key)
.expect("we filter non-rel-block keys out above");
@@ -351,10 +349,10 @@ async fn main_impl(
let work_sender_task = tokio::spawn(work_sender);
info!("waiting for everything to become ready");
start_work_barrier.wait().await;
info!("work started");
if let Some(runtime) = args.runtime {
info!("waiting for everything to become ready");
start_work_barrier.wait().await;
info!("work started");
tokio::time::sleep(runtime.into()).await;
info!("runtime over, signalling cancellation");
cancel.cancel();

View File

@@ -527,6 +527,7 @@ fn start_pageserver(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
}

View File

@@ -1126,11 +1126,12 @@ mod tests {
};
use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;
use remote_storage::{RemoteStorageKind, S3Config};
use utils::serde_percent::Percent;
use super::*;
use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};
use crate::DEFAULT_PG_VERSION;
const ALL_BASE_VALUES_TOML: &str = r#"
# Initial configuration file created by 'pageserver --init'

View File

@@ -267,7 +267,7 @@ async fn calculate_synthetic_size_worker(
}
};
for (tenant_shard_id, tenant_state) in tenants {
for (tenant_shard_id, tenant_state, _gen) in tenants {
if tenant_state != TenantState::Active {
continue;
}

View File

@@ -196,7 +196,7 @@ pub(super) async fn collect_all_metrics(
}
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
if state != TenantState::Active || !id.is_zero() {
None
} else {

View File

@@ -47,21 +47,24 @@ use std::{
};
use anyhow::Context;
use camino::Utf8Path;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use serde::{Deserialize, Serialize};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, instrument, warn, Instrument};
use utils::completion;
use utils::serde_percent::Percent;
use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
storage_layer::{AsLayerDesc, EvictionError, Layer},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
Timeline,
},
};
@@ -125,6 +128,7 @@ pub fn launch_disk_usage_global_eviction_task(
conf: &'static PageServerConf,
storage: GenericRemoteStorage,
state: Arc<State>,
tenant_manager: Arc<TenantManager>,
background_jobs_barrier: completion::Barrier,
) -> anyhow::Result<()> {
let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -150,8 +154,7 @@ pub fn launch_disk_usage_global_eviction_task(
_ = background_jobs_barrier.wait() => { }
};
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
.await;
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
Ok(())
},
);
@@ -164,7 +167,7 @@ async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) {
scopeguard::defer! {
@@ -191,7 +194,7 @@ async fn disk_usage_eviction_task(
state,
task_config,
storage,
tenants_dir,
&tenant_manager,
&cancel,
)
.await;
@@ -226,15 +229,17 @@ async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
tenant_manager: &Arc<TenantManager>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
let tenants_dir = tenant_manager.get_conf().tenants_path();
let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(
state,
storage,
usage_pre,
tenant_manager,
task_config.eviction_order,
cancel,
)
@@ -248,7 +253,7 @@ async fn disk_usage_eviction_task_iteration(
}
IterationOutcome::Finished(outcome) => {
// Verify with statvfs whether we made any real progress
let after = filesystem_level_usage::get(tenants_dir, task_config)
let after = filesystem_level_usage::get(&tenants_dir, task_config)
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
.context("get filesystem-level disk usage after evictions")?;
@@ -324,6 +329,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
tenant_manager: &Arc<TenantManager>,
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -344,29 +350,29 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => partitioned,
};
let candidates =
match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => partitioned,
};
// Debug-log the list of candidates
let now = SystemTime::now();
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let nth = i + 1;
let desc = candidate.layer.layer_desc();
let total_candidates = candidates.len();
let size = desc.file_size;
let size = candidate.layer.get_file_size();
let rel = candidate.relative_last_activity;
debug!(
"cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
now.duration_since(candidate.last_activity_ts)
.unwrap()
.as_micros(),
desc.tenant_shard_id,
desc.timeline_id,
candidate.layer,
candidate.layer.get_tenant_shard_id(),
candidate.layer.get_timeline_id(),
candidate.layer.get_name(),
);
}
@@ -398,7 +404,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
usage_planned.add_available_bytes(candidate.layer.get_file_size());
evicted_amount += 1;
}
@@ -463,19 +469,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
continue;
};
js.spawn(async move {
let rtc = candidate.timeline.remote_client.as_ref().expect(
"holding the witness, all timelines must have a remote timeline client",
);
let file_size = candidate.layer.layer_desc().file_size;
candidate
.layer
.evict_and_wait(rtc)
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
match candidate.layer {
EvictionLayer::Attached(layer) => {
let file_size = layer.layer_desc().file_size;
js.spawn(async move {
layer
.evict_and_wait()
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
}
EvictionLayer::Secondary(layer) => {
let file_size = layer.metadata.file_size();
let tenant_manager = tenant_manager.clone();
js.spawn(async move {
layer
.secondary_tenant
.evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
.await;
Ok(file_size)
});
}
}
tokio::task::yield_now().await;
}
@@ -502,11 +519,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
}
#[derive(Clone)]
struct EvictionCandidate {
timeline: Arc<Timeline>,
layer: Layer,
last_activity_ts: SystemTime,
relative_last_activity: finite_f32::FiniteF32,
pub(crate) struct EvictionSecondaryLayer {
pub(crate) secondary_tenant: Arc<SecondaryTenant>,
pub(crate) timeline_id: TimelineId,
pub(crate) name: LayerFileName,
pub(crate) metadata: LayerFileMetadata,
}
/// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer
/// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
#[derive(Clone)]
pub(crate) enum EvictionLayer {
Attached(Layer),
#[allow(dead_code)]
Secondary(EvictionSecondaryLayer),
}
impl From<Layer> for EvictionLayer {
fn from(value: Layer) -> Self {
Self::Attached(value)
}
}
impl EvictionLayer {
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
match self {
Self::Attached(l) => &l.layer_desc().tenant_shard_id,
Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
}
}
pub(crate) fn get_timeline_id(&self) -> &TimelineId {
match self {
Self::Attached(l) => &l.layer_desc().timeline_id,
Self::Secondary(sl) => &sl.timeline_id,
}
}
pub(crate) fn get_name(&self) -> LayerFileName {
match self {
Self::Attached(l) => l.layer_desc().filename(),
Self::Secondary(sl) => sl.name.clone(),
}
}
pub(crate) fn get_file_size(&self) -> u64 {
match self {
Self::Attached(l) => l.layer_desc().file_size,
Self::Secondary(sl) => sl.metadata.file_size(),
}
}
}
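// Illustrative sketch, not part of the diff: the accessors above let callers treat
// attached and secondary layers uniformly without matching on the enum. The helper
// below is hypothetical and assumes only the `EvictionLayer` API defined here.
fn total_layer_bytes(layers: &[EvictionLayer]) -> u64 {
    // get_file_size() dispatches to the layer descriptor for attached layers
    // and to the on-disk metadata for secondary layers.
    layers.iter().map(|l| l.get_file_size()).sum()
}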
#[derive(Clone)]
pub(crate) struct EvictionCandidate {
pub(crate) layer: EvictionLayer,
pub(crate) last_activity_ts: SystemTime,
pub(crate) relative_last_activity: finite_f32::FiniteF32,
}
impl std::fmt::Display for EvictionLayer {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::Attached(l) => l.fmt(f),
Self::Secondary(sl) => {
write!(f, "{}/{}", sl.timeline_id, sl.name)
}
}
}
}
pub(crate) struct DiskUsageEvictionInfo {
/// Timeline's largest layer (remote or resident)
pub max_layer_size: Option<u64>,
/// Timeline's resident layers
pub resident_layers: Vec<EvictionCandidate>,
}
impl std::fmt::Debug for EvictionCandidate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
// allocating a string for this is wasteful, but the value is rarely formatted
let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
struct DisplayIsDebug<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
f.debug_struct("LocalLayerInfoForDiskUsageEviction")
.field("layer", &DisplayIsDebug(&self.layer))
.field("last_activity", &ts)
.finish()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -623,6 +729,7 @@ enum EvictionCandidates {
/// - tenant B 1 layer
/// - tenant C 8 layers
async fn collect_eviction_candidates(
tenant_manager: &Arc<TenantManager>,
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
@@ -631,13 +738,16 @@ async fn collect_eviction_candidates(
.await
.context("get list of tenants")?;
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
// and the resulting data structure can be huge.
// (https://github.com/neondatabase/neon/issues/6224)
let mut candidates = Vec::new();
for (tenant_id, _state) in &tenants {
for (tenant_id, _state, _gen) in tenants {
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
Ok(tenant) => tenant,
Err(e) => {
// this can happen if the tenant had a lifecycle transition after we fetched the list
@@ -665,11 +775,7 @@ async fn collect_eviction_candidates(
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers
.into_iter()
.map(|layer_infos| (tl.clone(), layer_infos)),
);
tenant_candidates.extend(info.resident_layers.into_iter());
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
if cancel.is_cancelled() {
@@ -690,14 +796,16 @@ async fn collect_eviction_candidates(
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
overridden_size=s,
"using overridden min resident size for tenant"
);
s
} else {
debug!(
tenant_id=%tenant.tenant_id(),
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
max_layer_size,
"using max layer size as min_resident_size for tenant",
);
@@ -707,7 +815,7 @@ async fn collect_eviction_candidates(
// Sort layers most-recently-used first, then partition by
// cumsum above/below min_resident_size.
tenant_candidates
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
// keeping the -1 or not decides if every tenant should lose their least recently accessed
@@ -741,12 +849,10 @@ async fn collect_eviction_candidates(
.unwrap_or(1);
let divider = total as f32;
for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
let file_size = layer_info.file_size();
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
// as we iterate this reverse-sorted list, the most recently accessed layer will always
// be 1.0, so that it is evicted last.
let relative_last_activity = if matches!(
candidate.relative_last_activity = if matches!(
eviction_order,
EvictionOrder::RelativeAccessed { .. }
) {
@@ -761,22 +867,46 @@ async fn collect_eviction_candidates(
finite_f32::FiniteF32::ZERO
};
let candidate = EvictionCandidate {
timeline,
last_activity_ts: layer_info.last_activity_ts,
layer: layer_info.layer,
relative_last_activity,
};
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
} else {
MinResidentSizePartition::Below
};
cumsum += i128::from(candidate.layer.get_file_size());
candidates.push((partition, candidate));
cumsum += i128::from(file_size);
}
}
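// Worked example, as a sketch (not part of the diff): with min_resident_size = 100
// and layers sorted most-recently-used first with sizes [40, 50, 30, 20], the cumsum
// observed before each partition decision is 0, 40, 90, 120; only the last (least
// recently used) layer exceeds 100 and lands in MinResidentSizePartition::Above, so
// it is offered for eviction first. The loop body, reduced to plain integers:
fn partition_above(sizes: &[u64], min_resident_size: u64) -> Vec<bool> {
    let mut cumsum: i128 = 0;
    sizes
        .iter()
        .map(|size| {
            let above = cumsum > min_resident_size as i128; // `true` == Above
            cumsum += i128::from(*size);
            above
        })
        .collect()
}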
// Note: the same tenant ID might be hit twice, if it transitions from attached to
// secondary while we run. That is okay: when we eventually try to run the eviction,
// the `Gate` on the object will ensure that whichever one has already been shut down
// will not delete anything.
let mut secondary_tenants = Vec::new();
tenant_manager.foreach_secondary_tenants(
|_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
secondary_tenants.push(state.clone());
},
);
for secondary_tenant in secondary_tenants {
let mut layer_info = secondary_tenant.get_layers_for_eviction();
layer_info
.resident_layers
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
(
// Secondary locations' layers are always considered above the min resident size,
// i.e. secondary locations are permitted to be trimmed to zero layers if all
// the layers have sufficiently old access times.
MinResidentSizePartition::Above,
candidate,
)
}));
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
@@ -821,7 +951,7 @@ impl std::ops::Deref for TimelineKey {
}
/// A totally ordered f32 subset we can use with sorting functions.
mod finite_f32 {
pub(crate) mod finite_f32 {
/// A totally ordered f32 subset we can use with sorting functions.
#[derive(Clone, Copy, PartialEq)]
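// Why such a wrapper exists, as a hypothetical sketch (not part of the diff): `f32`
// is only `PartialOrd` because NaN compares as unordered, so raw floats cannot be
// used with `sort_unstable_by_key`. Excluding NaN at construction time makes a total
// order safe to implement:
#[derive(Clone, Copy, PartialEq)]
struct Finite(f32);
impl Eq for Finite {}
impl PartialOrd for Finite {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Finite {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Cannot fail: construction is assumed to reject NaN.
        self.0.partial_cmp(&other.0).expect("NaN excluded by construction")
    }
}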

View File

@@ -12,8 +12,10 @@ use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use hyper::{Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
@@ -30,6 +32,7 @@ use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use utils::http::Body;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
@@ -38,11 +41,11 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::UpsertLocationError;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -62,7 +65,7 @@ use utils::{
http::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
error::{ApiError, HttpErrorBody},
json::{json_request, json_response},
json::{json_request, json_response_body as json_response},
request::parse_request_param,
RequestExt, RouterBuilder,
},
@@ -265,7 +268,7 @@ impl From<SetNewTenantConfigError> for ApiError {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ SetNewTenantConfigError::Persist(_) => {
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
@@ -704,7 +707,9 @@ async fn tenant_attach_handler(
}
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let shard_params = ShardParameters::default();
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
let tenant = state
.tenant_manager
.upsert_location(
@@ -874,11 +879,12 @@ async fn tenant_list_handler(
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
})?
.iter()
.map(|(id, state)| TenantInfo {
.map(|(id, state, gen)| TenantInfo {
id: *id,
state: state.clone(),
current_physical_size: None,
attachment_status: state.attachment_status(),
generation: (*gen).into(),
})
.collect::<Vec<TenantInfo>>();
@@ -908,6 +914,7 @@ async fn tenant_status(
state: state.clone(),
current_physical_size: Some(current_physical_size),
attachment_status: state.attachment_status(),
generation: tenant.generation().into(),
},
timelines: tenant.list_timeline_ids(),
})
@@ -1192,7 +1199,8 @@ async fn tenant_create_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let location_conf =
LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);
let new_tenant = state
.tenant_manager
@@ -1211,7 +1219,6 @@ async fn tenant_create_handler(
"Upsert succeeded but didn't return tenant!"
)));
};
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
if let res @ Err(_) = new_tenant
@@ -1230,7 +1237,7 @@ async fn tenant_create_handler(
json_response(
StatusCode::CREATED,
TenantCreateResponse(new_tenant.tenant_id()),
TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
)
}
@@ -1349,6 +1356,28 @@ async fn put_tenant_location_config_handler(
json_response(StatusCode::OK, ())
}
async fn list_location_config_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let slots = state.tenant_manager.list();
let result = LocationConfigListResponse {
tenant_shards: slots
.into_iter()
.map(|(tenant_shard_id, slot)| {
let v = match slot {
TenantSlot::Attached(t) => Some(t.get_location_conf()),
TenantSlot::Secondary(s) => Some(s.get_location_conf()),
TenantSlot::InProgress(_) => None,
};
(tenant_shard_id, v)
})
.collect(),
};
json_response(StatusCode::OK, result)
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
async fn handle_tenant_break(
r: Request<Body>,
@@ -1543,7 +1572,7 @@ async fn getpage_at_lsn_handler(
Response::builder()
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page))
.body(Body::from(page))
.unwrap(),
)
}
@@ -1650,12 +1679,13 @@ async fn disk_usage_eviction_run(
)));
};
let state = state.disk_usage_eviction_state.clone();
let eviction_state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
&eviction_state,
storage,
usage,
&state.tenant_manager,
config.eviction_order,
&cancel,
)
@@ -1839,7 +1869,7 @@ pub fn make_router(
state: Arc<State>,
launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<SwappableJwtAuth>>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
) -> anyhow::Result<RouterBuilder<Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
if auth.is_some() {
@@ -1890,6 +1920,9 @@ pub fn make_router(
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
api_handler(r, put_tenant_location_config_handler)
})
.get("/v1/location_config", |r| {
api_handler(r, list_location_config_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})

View File

@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
use utils::id::TimelineId;
/// Prometheus histogram buckets (in seconds) for operations in the critical
/// path. In other words, operations that directly affect the latency of user
@@ -59,7 +59,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
register_counter_vec!(
"pageserver_storage_operations_seconds_sum",
"Total time spent on storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "timeline_id"],
&["operation", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
@@ -68,7 +68,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
register_int_counter_vec!(
"pageserver_storage_operations_seconds_count",
"Count of storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "timeline_id"],
&["operation", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
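// Illustrative sketch, not part of the diff: with shard_id in the label set, every
// lookup must pass four values in declaration order; a call site still passing three
// fails at runtime with a label-cardinality error, which is why each get/remove below
// gains a shard_id argument. The helper and its arguments are hypothetical.
fn record_storage_seconds(op: &str, tenant_id: &str, shard_id: &str, timeline_id: &str, secs: f64) {
    STORAGE_TIME_SUM_PER_TIMELINE
        .get_metric_with_label_values(&[op, tenant_id, shard_id, timeline_id])
        .unwrap()
        .inc_by(secs);
}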
@@ -337,15 +337,6 @@ pub(crate) mod page_cache_eviction_metrics {
}
}
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
"Time spent acquiring a pinned slot in the page cache",
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
@@ -382,7 +373,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_last_record_lsn",
"Last record LSN grouped by timeline",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -391,7 +382,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -409,7 +400,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -442,7 +433,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_current_logical_size",
"Current logical size grouped by timeline",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define current logical size metric")
});
@@ -591,7 +582,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_broken_tenants_count",
"Set of broken tenants",
&["tenant_id"]
&["tenant_id", "shard_id"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
@@ -611,7 +602,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -620,7 +611,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -639,7 +630,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_evictions",
"Number of layers evicted from the pageserver",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -936,7 +927,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "timeline_id"]
&["operation", "tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -1011,7 +1002,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "timeline_id"],
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
@@ -1078,8 +1069,9 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_id.to_string();
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
@@ -1087,7 +1079,7 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[op.into()])
.unwrap();
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
.get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
.unwrap();
GlobalAndPerTimelineHistogram {
global,
@@ -1107,6 +1099,7 @@ impl SmgrQueryTimePerTimeline {
#[cfg(test)]
mod smgr_query_time_tests {
use pageserver_api::shard::TenantShardId;
use strum::IntoEnumIterator;
use utils::id::{TenantId, TimelineId};
@@ -1133,7 +1126,10 @@ mod smgr_query_time_tests {
for op in &ops {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
let metrics = super::SmgrQueryTimePerTimeline::new(
&TenantShardId::unsharded(tenant_id),
&timeline_id,
);
let get_counts = || {
let global: u64 = ops
@@ -1214,7 +1210,13 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
"Number of ongoing calls to remote timeline client. \
Used to populate pageserver_remote_timeline_client_calls_started. \
This metric is not useful for sampling from Prometheus, but useful in tests.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&[
"tenant_id",
"shard_id",
"timeline_id",
"file_kind",
"op_kind"
],
)
.expect("failed to define a metric")
});
@@ -1235,22 +1237,23 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_started",
"Incremented by the number of bytes associated with a remote timeline client operation. \
The increment happens when the operation is scheduled.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_finished",
"Incremented by the number of bytes associated with a remote timeline client operation. \
The increment happens when the operation finishes (regardless of success/failure/shutdown).",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
@@ -1696,14 +1699,19 @@ pub(crate) struct StorageTimeMetrics {
}
impl StorageTimeMetrics {
pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
pub fn new(
operation: StorageTimeOperation,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> Self {
let operation: &'static str = operation.into();
let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
.unwrap();
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
.unwrap();
let global_histogram = STORAGE_TIME_GLOBAL
.get_metric_with_label_values(&[operation])
@@ -1755,40 +1763,66 @@ impl TimelineMetrics {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
let create_images_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
let logical_size_histo =
StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
let flush_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::LayerFlush,
&tenant_id,
&shard_id,
&timeline_id,
);
let compact_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::Compact,
&tenant_id,
&shard_id,
&timeline_id,
);
let create_images_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::CreateImages,
&tenant_id,
&shard_id,
&timeline_id,
);
let logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::LogicalSize,
&tenant_id,
&shard_id,
&timeline_id,
);
let imitate_logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::ImitateLogicalSize,
&tenant_id,
&shard_id,
&timeline_id,
);
let load_layer_map_histo = StorageTimeMetrics::new(
StorageTimeOperation::LoadLayerMap,
&tenant_id,
&shard_id,
&timeline_id,
);
let garbage_collect_histo = StorageTimeMetrics::new(
StorageTimeOperation::Gc,
&tenant_id,
&shard_id,
&timeline_id,
);
let load_layer_map_histo =
StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
let garbage_collect_histo =
StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO: we shouldn't expose this metric
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
@@ -1842,15 +1876,17 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ =
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -1863,29 +1899,42 @@ impl Drop for TimelineMetrics {
// outlive an individual smgr connection, but not the timeline.
for op in StorageTimeOperation::VARIANTS {
let _ =
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ =
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
op,
tenant_id,
shard_id,
timeline_id,
]);
let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
op,
tenant_id,
shard_id,
timeline_id,
]);
}
for op in STORAGE_IO_SIZE_OPERATIONS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
for op in SmgrQueryType::iter() {
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
op.into(),
tenant_id,
shard_id,
timeline_id,
]);
}
}
}
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let tid = tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
// Only shard zero deals in synthetic sizes
if tenant_shard_id.is_zero() {
let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
// we leave the BROKEN_TENANTS_SET entry if any
}
@@ -1935,6 +1984,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
pub(crate) struct RemoteTimelineClientMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
@@ -1946,6 +1996,7 @@ impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_id: format!("{}", tenant_shard_id.shard_slug()),
timeline_id: timeline_id.to_string(),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
@@ -1960,8 +2011,9 @@ impl RemoteTimelineClientMetrics {
PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.unwrap(),
)
@@ -1996,8 +2048,9 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
key.0,
key.1,
])
@@ -2027,8 +2080,9 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
key.0,
key.1,
])
@@ -2047,8 +2101,9 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
key.0,
key.1,
])
@@ -2192,6 +2247,7 @@ impl Drop for RemoteTimelineClientMetrics {
fn drop(&mut self) {
let RemoteTimelineClientMetrics {
tenant_id,
shard_id,
timeline_id,
remote_physical_size_gauge,
calls_unfinished_gauge,
@@ -2201,6 +2257,7 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2209,6 +2266,7 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2217,6 +2275,7 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2224,7 +2283,7 @@ impl Drop for RemoteTimelineClientMetrics {
}
{
let _ = remote_physical_size_gauge; // used to avoid an 'unused' warning in the destructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
}
}
@@ -2234,8 +2293,6 @@ impl Drop for RemoteTimelineClientMetrics {
pub(crate) trait MeasureRemoteOp: Sized {
fn measure_remote_op(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
metrics: Arc<RemoteTimelineClientMetrics>,
@@ -2243,8 +2300,6 @@ pub(crate) trait MeasureRemoteOp: Sized {
let start = Instant::now();
MeasuredRemoteOp {
inner: self,
tenant_id,
timeline_id,
file_kind,
op,
start,
@@ -2260,8 +2315,6 @@ pin_project! {
{
#[pin]
inner: F,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
start: Instant,

View File

@@ -550,7 +550,6 @@ impl PageCache {
// not require changes.
async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
match tokio::time::timeout(
// Choose small timeout, neon_smgr does its own retries.
// https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
@@ -563,7 +562,6 @@ impl PageCache {
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
timer.stop_and_discard();
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
);

View File

@@ -13,7 +13,10 @@ use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -21,11 +24,14 @@ use pageserver_api::models::{
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::{ShardCount, ShardNumber};
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
@@ -40,6 +46,7 @@ use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
use utils::{
auth::{Claims, Scope, SwappableJwtAuth},
id::{TenantId, TimelineId},
@@ -274,6 +281,13 @@ async fn page_service_conn_main(
}
}
/// While a handler holds a reference to a Timeline, it also holds the
/// timeline's Gate open.
struct HandlerTimeline {
timeline: Arc<Timeline>,
_guard: GateGuard,
}
struct PageServerHandler {
_conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
@@ -285,6 +299,14 @@ struct PageServerHandler {
/// For each query received over the connection,
/// `process_query` creates a child context from this one.
connection_ctx: RequestContext,
/// See [`Self::cache_timeline`] for usage.
///
/// Note on size: the typical size of this map is 1. The largest size we expect
/// to see is the number of shards divided by the number of pageservers (typically < 2),
/// or the ratio used when splitting shards (i.e. how many children are created from one
/// parent shard), where a "large" number might be ~8.
shard_timelines: HashMap<ShardIndex, HandlerTimeline>,
}
#[derive(thiserror::Error, Debug)]
@@ -358,13 +380,46 @@ impl PageServerHandler {
auth,
claims: None,
connection_ctx,
shard_timelines: HashMap::new(),
}
}
/// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
/// this rather than naked flush() in order to shut down promptly. Without this, we would
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
/// in the flush.
/// Analogous to calling cancelled() on a Timeline's cancellation token: waits for cancellation.
///
/// We use many Timeline objects, and hold GateGuards on all of them. We must therefore respect
/// all of their cancellation tokens.
async fn timeline_cancelled(&self) {
// A short wait before we expend the cycles to walk our timeline map. This avoids incurring
// that cost every time we check for cancellation.
tokio::time::sleep(Duration::from_millis(10)).await;
// This function is never called concurrently with code that adds timelines to shard_timelines,
// which is enforced by the borrow checker (the future returned by this function carries the
// immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk
// missing any inserts to the map.
let mut futs = self
.shard_timelines
.values()
.map(|ht| ht.timeline.cancel.cancelled())
.collect::<FuturesUnordered<_>>();
futs.next().await;
}
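// The pattern above, standalone (illustrative sketch, not part of the diff):
// FuturesUnordered yields items as they complete, so awaiting next() on a set of
// cancelled() futures resolves on the *first* cancellation. Note that with zero
// tokens next() returns None immediately, which is why callers must also supply
// an outer cancellation token when the map may be empty.
async fn any_cancelled(tokens: &[tokio_util::sync::CancellationToken]) {
    use futures::stream::{FuturesUnordered, StreamExt};
    let mut futs = tokens
        .iter()
        .map(|t| t.cancelled())
        .collect::<FuturesUnordered<_>>();
    futs.next().await;
}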
/// Analogous to calling is_cancelled() on a Timeline's cancellation token
fn timeline_is_cancelled(&self) -> bool {
self.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
}
/// This function always respects cancellation of any timeline in [`Self::shard_timelines`]. Pass in
/// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect
/// cancellation if there aren't any timelines in the cache.
///
/// If calling from a function that doesn't use the [`Self::shard_timelines`] cache, then pass in the
/// timeline cancellation token.
async fn flush_cancellable<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
@@ -377,6 +432,9 @@ impl PageServerHandler {
flush_r = pgb.flush() => {
Ok(flush_r?)
},
_ = self.timeline_cancelled() => {
Err(QueryError::Shutdown)
}
_ = cancel.cancelled() => {
Err(QueryError::Shutdown)
}
@@ -452,7 +510,7 @@ impl PageServerHandler {
#[instrument(skip_all)]
async fn handle_pagerequests<IO>(
&self,
&mut self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -463,10 +521,6 @@ impl PageServerHandler {
{
debug_assert_current_span_has_tenant_and_timeline_id();
// Note that since one connection may contain getpage requests that target different
// shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
// that we look up here may not be the one that serves all the actual requests: we will double
// check the mapping of key->shard later before calling into Timeline for getpage requests.
let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
ShardSelector::First,
@@ -487,27 +541,15 @@ impl PageServerHandler {
None
};
// Check that the timeline exists
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
// to cancellation.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
self.flush_cancellable(pgb, &tenant.cancel).await?;
loop {
let msg = tokio::select! {
biased;
_ = timeline.cancel.cancelled() => {
_ = self.timeline_cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
@@ -541,40 +583,36 @@ impl PageServerHandler {
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
}
PagestreamFeMessage::Nblocks(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_nblocks_request(&timeline, &req, &ctx)
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
}
PagestreamFeMessage::GetPage(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
}
PagestreamFeMessage::DbSize(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
(
self.handle_db_size_request(&timeline, &req, &ctx)
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
@@ -594,7 +632,7 @@ impl PageServerHandler {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
Err(e) if self.timeline_is_cancelled() => {
// This branch accommodates code within request handlers that returns an anyhow::Error instead of a clean
// shutdown error; this may be buried inside a PageReconstructError::Other, for example.
//
@@ -617,7 +655,7 @@ impl PageServerHandler {
});
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
}
}
}
@@ -814,11 +852,17 @@ impl PageServerHandler {
}
async fn handle_get_rel_exists_request(
&self,
timeline: &Timeline,
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamExistsRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetRelExists);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -834,11 +878,18 @@ impl PageServerHandler {
}
async fn handle_get_nblocks_request(
&self,
timeline: &Timeline,
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamNblocksRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetRelSize);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -854,11 +905,18 @@ impl PageServerHandler {
}
async fn handle_db_size_request(
&self,
timeline: &Timeline,
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamDbSizeRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetDbSize);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -880,16 +938,164 @@ impl PageServerHandler {
}))
}
async fn do_handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
/// For most getpage requests, we will already have a Timeline to serve the request: this function
/// looks up such a Timeline synchronously and without touching any global state.
fn get_cached_timeline_for_page(
&mut self,
req: &PagestreamGetPageRequest,
) -> Result<&Arc<Timeline>, Key> {
let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
// Fastest path: single sharded case
if first_idx.shard_count < ShardCount(2) {
return Ok(&first_timeline.timeline);
}
let key = rel_block_to_key(req.rel, req.blkno);
let shard_num = first_timeline
.timeline
.get_shard_identity()
.get_shard_number(&key);
// Fast path: matched the first timeline in our local handler map. This case is common if
// only one shard per tenant is attached to this pageserver.
if first_timeline.timeline.get_shard_identity().number == shard_num {
return Ok(&first_timeline.timeline);
}
let shard_index = ShardIndex {
shard_number: shard_num,
shard_count: first_timeline.timeline.get_shard_identity().count,
};
// Fast-ish path: timeline is in the connection handler's local cache
if let Some(found) = self.shard_timelines.get(&shard_index) {
return Ok(&found.timeline);
}
key
} else {
rel_block_to_key(req.rel, req.blkno)
};
Err(key)
}
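// The lookup above, reduced to its decision ladder (hypothetical sketch, not part of
// the diff; the cache is simplified to a map of shard number -> handle): unsharded
// tenants short-circuit to the only cached entry, sharded tenants serve from the
// cache when the key's shard is attached here, and otherwise the shard number is
// handed back for the slow path (`load_timeline_for_page`).
fn route_page(
    cache: &std::collections::HashMap<u32, String>,
    shard_count: u32,
    key_shard: u32, // shard number computed from the page key
) -> Result<&String, u32> {
    if shard_count < 2 {
        // Unsharded fast path: the single cached handle serves every key.
        if let Some(handle) = cache.values().next() {
            return Ok(handle);
        }
    }
    // Serve locally if the owning shard is cached; otherwise defer to the slow path.
    cache.get(&key_shard).ok_or(key_shard)
}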
/// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable
/// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`]
/// again.
///
/// Note that all the Timelines in this cache are for the same timeline_id: they differ
/// only in which shard they belong to. When we serve a getpage@lsn request, we choose a
/// shard based on the key.
///
/// The typical size of this cache is 1, as we generally create shards to distribute work
/// across pageservers, so we don't tend to have multiple shards for the same tenant on
/// the same pageserver.
fn cache_timeline(
&mut self,
timeline: Arc<Timeline>,
) -> Result<&Arc<Timeline>, GetActiveTimelineError> {
let gate_guard = timeline
.gate
.enter()
.map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?;
let shard_index = timeline.tenant_shard_id.to_index();
let entry = self
.shard_timelines
.entry(shard_index)
.or_insert(HandlerTimeline {
timeline,
_guard: gate_guard,
});
Ok(&entry.timeline)
}
/// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with
/// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such
/// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node).
async fn load_timeline_for_page(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
) -> anyhow::Result<&Arc<Timeline>, GetActiveTimelineError> {
// Slow path: we must call out to the TenantManager to find the timeline for this Key
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key))
.await?;
self.cache_timeline(timeline)
}
async fn get_timeline_shard_zero(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<&Arc<Timeline>, GetActiveTimelineError> {
// This is a borrow-checker workaround: we can't return from inside the `if let Some` because
// that would be an immutable-borrow-self return, whereas later in the function we will use a mutable
// ref to self. So instead, we first build a bool, and then return while not borrowing self.
let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() {
idx.shard_number == ShardNumber(0)
} else {
false
};
if have_cached {
let entry = self.shard_timelines.iter().next().unwrap();
Ok(&entry.1.timeline)
} else {
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
Ok(self.cache_timeline(timeline)?)
}
}
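// The borrow-checker workaround above, in miniature (illustrative sketch, not part
// of the diff): returning a reference out of the first `if let` would keep the
// immutable borrow of `self` alive for the rest of the function, conflicting with
// the later mutable call; computing a plain bool first lets that borrow end.
struct Cache {
    items: Vec<String>,
}
impl Cache {
    fn refill(&mut self) {
        self.items.push("fresh".into());
    }
    fn first_or_refill(&mut self) -> &String {
        let have_cached = !self.items.is_empty(); // immutable borrow ends here
        if !have_cached {
            self.refill(); // mutable borrow now permitted
        }
        self.items.first().expect("non-empty after refill")
    }
}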
async fn handle_get_page_at_lsn_request(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamGetPageRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
Ok(tl) => tl,
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
.await
{
Ok(t) => t,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
//
// Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting the above message, via
// the client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return Err(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
));
}
Err(e) => return Err(e.into()),
}
}
};
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?;
@@ -899,60 +1105,6 @@ impl PageServerHandler {
}))
}
async fn handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let key = rel_block_to_key(req.rel, req.blkno);
if timeline.get_shard_identity().is_key_local(&key) {
self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
.await
} else {
// The Tenant shard we looked up at connection start does not hold this particular
// key: look for other shards in this tenant. This scenario occurs if a pageserver
// has multiple shards for the same tenant.
//
// TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
let timeline = match self
.get_active_tenant_timeline(
timeline.tenant_shard_id.tenant_id,
timeline.timeline_id,
ShardSelector::Page(key),
)
.await
{
Ok(t) => t,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
// Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting the above message, via
// the client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return Err(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
));
}
Err(e) => return Err(e.into()),
};
// Take a GateGuard for the duration of this request. If we had been using our main Timeline object,
// the GateGuard would already be held over the whole connection.
let _timeline_guard = timeline
.gate
.enter()
.map_err(|_| PageStreamError::Shutdown)?;
self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
.await
}
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
async fn handle_basebackup_request<IO>(

View File

@@ -14,7 +14,7 @@ use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -27,9 +27,6 @@ use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
pub type BlockNumber = u32;
#[derive(Debug)]
pub enum LsnForTimestamp {
/// Found commits both before and after the given timestamp
@@ -1863,21 +1860,6 @@ pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
/// Guaranteed to return `Ok` if [`is_rel_block_key`] returns `true` for `key`.
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
RelTag {
spcnode: key.field2,
dbnode: key.field3,
relnode: key.field4,
forknum: key.field5,
},
key.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}

View File

@@ -13,10 +13,12 @@
use anyhow::{bail, Context};
use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::TimelineState;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
@@ -72,6 +74,7 @@ use crate::tenant::config::LocationMode;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
@@ -109,7 +112,7 @@ use toml_edit;
use utils::{
crashsafe,
generation::Generation,
id::{TenantId, TimelineId},
id::TimelineId,
lsn::{Lsn, RecordLsn},
};
@@ -368,13 +371,13 @@ impl WalRedoManager {
pub enum GetTimelineError {
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantId,
tenant_id: TenantShardId,
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
NotFound {
tenant_id: TenantId,
tenant_id: TenantShardId,
timeline_id: TimelineId,
},
}
@@ -1514,10 +1517,6 @@ impl Tenant {
.map_err(LoadLocalTimelineError::Load)
}
pub(crate) fn tenant_id(&self) -> TenantId {
self.tenant_shard_id.tenant_id
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
@@ -1533,13 +1532,13 @@ impl Tenant {
let timeline = timelines_accessor
.get(&timeline_id)
.ok_or(GetTimelineError::NotFound {
tenant_id: self.tenant_shard_id.tenant_id,
tenant_id: self.tenant_shard_id,
timeline_id,
})?;
if active_only && !timeline.is_active() {
Err(GetTimelineError::NotActive {
tenant_id: self.tenant_shard_id.tenant_id,
tenant_id: self.tenant_shard_id,
timeline_id,
state: timeline.current_state(),
})
@@ -1930,6 +1929,10 @@ impl Tenant {
self.current_state() == TenantState::Active
}
pub fn generation(&self) -> Generation {
self.generation
}
/// Changes tenant status to active, unless shutdown was already requested.
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
@@ -2319,6 +2322,32 @@ impl Tenant {
.clone()
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.tenant_conf.read().unwrap();
let location_config_mode = match conf.location.attach_mode {
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
};
// We have a pageserver TenantConf, we need the API-facing TenantConfig.
let tenant_config: models::TenantConfig = conf.tenant_conf.into();
models::LocationConfig {
mode: location_config_mode,
generation: self.generation.into(),
secondary_conf: None,
shard_number: self.shard_identity.number.0,
shard_count: self.shard_identity.count.0,
shard_stripe_size: self.shard_identity.stripe_size.0,
tenant_conf: tenant_config,
}
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
@@ -2564,7 +2593,9 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
// Strings for metric labels
let tid = tenant_shard_id.to_string();
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2577,13 +2608,15 @@ impl Tenant {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
drop(
crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
);
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.with_label_values(&[&tid, &shard_id_str])
.set(1);
true
};
@@ -2609,7 +2642,7 @@ impl Tenant {
counted_broken = true;
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.with_label_values(&[&tid, &shard_id_str])
.inc();
}
}
@@ -2669,10 +2702,11 @@ impl Tenant {
}
}
// Legacy configs are implicitly in attached state
// Legacy configs are implicitly in attached state, and do not support sharding
Ok(LocationConf::attached_single(
tenant_conf,
Generation::none(),
&models::ShardParameters::default(),
))
} else {
// FIXME If the config file is not found, assume that we're attaching
@@ -3177,6 +3211,55 @@ impl Tenant {
.await
}
async fn upload_initdb(
&self,
timelines_path: &Utf8PathBuf,
pgdata_path: &Utf8PathBuf,
timeline_id: &TimelineId,
) -> anyhow::Result<()> {
let Some(storage) = &self.remote_storage else {
// No remote storage? No upload.
return Ok(());
};
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
scopeguard::defer! {
if let Err(e) = fs::remove_file(&temp_path) {
error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
}
}
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
pausable_failpoint!("before-initdb-upload");
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.tenant_shard_id.tenant_id,
timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
},
|_| false,
3,
u32::MAX,
"persist_initdb_tar_zst",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await?;
Ok(())
}
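For orientation, the `backoff::retry` arguments above appear to be: the operation, a predicate classifying permanent errors (`|_| false`, i.e. nothing is permanent), a warn threshold of 3 attempts, a retry cap of `u32::MAX` (effectively retry until cancelled), an operation name for logging, and the cancellation hook. A rough, illustrative sketch of that control flow, assuming `tokio` and `tokio-util` (not the pageserver's actual `backoff` module):

```rust
use std::future::Future;
use std::time::Duration;
use tokio_util::sync::CancellationToken;

/// Returns None if cancelled, Some(v) once the operation succeeds.
async fn retry_until_cancelled<T, E, F, Fut>(
    mut op: F,
    cancel: &CancellationToken,
) -> Option<T>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut attempt: u32 = 0;
    loop {
        match op().await {
            Ok(v) => return Some(v),
            Err(_e) => {
                // Exponential backoff, capped at 100ms << 7 = 12.8s.
                let delay = Duration::from_millis(100u64 << attempt.min(7));
                tokio::select! {
                    _ = tokio::time::sleep(delay) => attempt += 1,
                    _ = cancel.cancelled() => return None,
                }
            }
        }
    }
}
```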
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
///
@@ -3216,6 +3299,18 @@ impl Tenant {
let Some(storage) = &self.remote_storage else {
bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
};
if existing_initdb_timeline_id != timeline_id {
let source_path = &remote_initdb_archive_path(
&self.tenant_shard_id.tenant_id,
&existing_initdb_timeline_id,
);
let dest_path =
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
storage
.copy_object(source_path, dest_path)
.await
.context("copy initdb tar")?;
}
let (initdb_tar_zst_path, initdb_tar_zst) =
self::remote_timeline_client::download_initdb_tar_zst(
self.conf,
@@ -3226,66 +3321,26 @@ impl Tenant {
)
.await
.context("download initdb tar")?;
scopeguard::defer! {
if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
}
}
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
tokio::fs::remove_file(&initdb_tar_zst_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
} else {
// Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
// Init a temporary repo to get bootstrap data; this creates a directory at the `pgdata_path` path
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
// Upload the created data dir to S3
if let Some(storage) = &self.remote_storage {
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.tenant_shard_id.tenant_id,
&timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
},
|_| false,
3,
u32::MAX,
"persist_initdb_tar_zst",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await?;
tokio::fs::remove_file(&temp_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {temp_path}"))?;
if self.tenant_shard_id().is_zero() {
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
.await?;
}
}
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
@@ -3574,6 +3629,9 @@ impl Tenant {
self.cached_synthetic_tenant_size
.store(size, Ordering::Relaxed);
// Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_zero());
TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
.unwrap()
@@ -3725,7 +3783,7 @@ async fn run_initdb(
impl Drop for Tenant {
fn drop(&mut self) {
remove_tenant_metrics(&self.tenant_shard_id.tenant_id);
remove_tenant_metrics(&self.tenant_shard_id);
}
}
/// Dump contents of a layer file to stdout.
@@ -3764,6 +3822,7 @@ pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
@@ -3948,6 +4007,7 @@ pub(crate) mod harness {
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt::from(self.tenant_conf),
self.generation,
&ShardParameters::default(),
))
.unwrap(),
// This is a legacy/test code path: sharding isn't supported here.
@@ -5151,7 +5211,7 @@ mod tests {
assert_eq!(
e,
GetTimelineError::NotFound {
tenant_id: tenant.tenant_shard_id.tenant_id,
tenant_id: tenant.tenant_shard_id,
timeline_id: TIMELINE_ID,
}
)

View File

@@ -10,6 +10,7 @@
//!
use anyhow::bail;
use pageserver_api::models;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -167,14 +168,17 @@ impl LocationConf {
/// For use when loading from a legacy configuration: presence of a tenant
/// implies it is in AttachmentMode::Single, which used to be the only
/// possible state. This function should eventually be removed.
pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
pub(crate) fn attached_single(
tenant_conf: TenantConfOpt,
generation: Generation,
shard_params: &models::ShardParameters,
) -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
generation,
attach_mode: AttachmentMode::Single,
}),
// Legacy configuration loads are always from tenants created before sharding existed.
shard: ShardIdentity::unsharded(),
shard: ShardIdentity::from_params(ShardNumber(0), shard_params),
tenant_conf,
}
}
@@ -428,30 +432,6 @@ pub struct TenantConfOpt {
pub heatmap_period: Option<Duration>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
NoEviction,
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
}
impl EvictionPolicy {
pub fn discriminant_str(&self) -> &'static str {
match self {
EvictionPolicy::NoEviction => "NoEviction",
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
pub period: Duration,
#[serde(with = "humantime_serde")]
pub threshold: Duration,
}
impl TenantConfOpt {
pub fn merge(&self, global_conf: TenantConf) -> TenantConf {
TenantConf {
@@ -576,6 +556,38 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
}
}
/// This is a conversion from our internal tenant config object to the one used
/// in external APIs.
impl From<TenantConfOpt> for models::TenantConfig {
fn from(value: TenantConfOpt) -> Self {
fn humantime(d: Duration) -> String {
format!("{}s", d.as_secs())
}
Self {
checkpoint_distance: value.checkpoint_distance,
checkpoint_timeout: value.checkpoint_timeout.map(humantime),
compaction_target_size: value.compaction_target_size,
compaction_period: value.compaction_period.map(humantime),
compaction_threshold: value.compaction_threshold,
gc_horizon: value.gc_horizon,
gc_period: value.gc_period.map(humantime),
image_creation_threshold: value.image_creation_threshold,
pitr_interval: value.pitr_interval.map(humantime),
walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
max_lsn_wal_lag: value.max_lsn_wal_lag,
trace_read_requests: value.trace_read_requests,
eviction_policy: value.eviction_policy,
min_resident_size_override: value.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: value
.evictions_low_residence_duration_metric_threshold
.map(humantime),
gc_feedback: value.gc_feedback,
heatmap_period: value.heatmap_period.map(humantime),
}
}
}
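One subtlety in the `humantime` helper above: it renders whole seconds only, so any sub-second component of a `Duration` is silently truncated on its way out to the API. A self-contained illustration:

```rust
use std::time::Duration;

fn humantime(d: Duration) -> String {
    format!("{}s", d.as_secs())
}

fn main() {
    // Whole seconds, "s"-suffixed.
    assert_eq!(humantime(Duration::from_secs(600)), "600s");
    // Sub-second precision is dropped by as_secs():
    assert_eq!(humantime(Duration::from_millis(2500)), "2s");
}
```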
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -542,6 +542,7 @@ impl DeleteTenantFlow {
)
.await?;
pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"

View File

@@ -3,7 +3,8 @@
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use pageserver_api::key::Key;
use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
@@ -56,6 +57,7 @@ use super::TenantSharedResources;
/// that way we avoid having to carefully switch a tenant's ingestion etc on and off during
/// its lifetime, and we can preserve some important safety invariants like `Tenant` always
/// having a properly acquired generation (Secondary doesn't need a generation)
#[derive(Clone)]
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
@@ -476,6 +478,8 @@ pub async fn init_tenant_mgr(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf,
secondary_config,
)),
);
@@ -760,6 +764,8 @@ pub(crate) enum SetNewTenantConfigError {
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
#[error(transparent)]
Other(anyhow::Error),
}
pub(crate) async fn set_new_tenant_config(
@@ -773,10 +779,21 @@ pub(crate) async fn set_new_tenant_config(
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if tenant.tenant_shard_id().shard_count > ShardCount(0) {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
let location_conf = LocationConf::attached_single(
new_tenant_conf,
tenant.generation,
&ShardParameters::default(),
);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
@@ -830,15 +847,13 @@ impl TenantManager {
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
@@ -907,6 +922,7 @@ impl TenantManager {
Some(TenantSlot::Secondary(secondary_tenant)),
) => {
secondary_tenant.set_config(secondary_conf);
secondary_tenant.set_tenant_conf(&new_location_config.tenant_conf);
Some(FastPathModified::Secondary(secondary_tenant.clone()))
}
_ => {
@@ -1039,16 +1055,36 @@ impl TenantManager {
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(secondary_config) => {
TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
let shard_identity = new_location_config.shard;
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
new_location_config.tenant_conf,
secondary_config,
))
}
LocationMode::Attached(_attach_config) => {
let shard_identity = new_location_config.shard;
// Testing hack: if we are configured with no control plane, then drop the generation
// from upserts. This enables creating generation-less tenants even though neon_local
// always uses generations when calling the location conf API.
let attached_conf = if cfg!(feature = "testing") {
let mut conf = AttachedTenantConf::try_from(new_location_config)?;
if self.conf.control_plane_api.is_none() {
conf.location.generation = Generation::none();
}
conf
} else {
AttachedTenantConf::try_from(new_location_config)?
};
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(new_location_config)?,
attached_conf,
shard_identity,
None,
self.tenants,
@@ -1189,6 +1225,17 @@ impl TenantManager {
}
}
/// Total list of all tenant slots: this includes attached, secondary, and InProgress.
pub(crate) fn list(&self) -> Vec<(TenantShardId, TenantSlot)> {
let locked = self.tenants.read().unwrap();
match &*locked {
TenantsMap::Initializing => Vec::new(),
TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => {
map.iter().map(|(k, v)| (*k, v.clone())).collect()
}
}
}
pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
@@ -1257,10 +1304,13 @@ impl TenantManager {
#[derive(Debug, thiserror::Error)]
pub(crate) enum GetTenantError {
/// NotFound is a TenantId rather than TenantShardId, because this error type is used from
/// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard.
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
NotActive(TenantShardId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
@@ -1293,15 +1343,13 @@ pub(crate) fn get_tenant(
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
@@ -1377,7 +1425,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
}
Some(TenantSlot::Secondary(_)) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
tenant_shard_id,
)))
}
Some(TenantSlot::InProgress(barrier)) => {
@@ -1416,7 +1464,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
Some(TenantSlot::Attached(tenant)) => tenant.clone(),
_ => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
tenant_shard_id,
)))
}
}
@@ -1444,7 +1492,7 @@ pub(crate) enum DeleteTimelineError {
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantStateError {
#[error("Tenant {0} is stopping")]
IsStopping(TenantId),
IsStopping(TenantShardId),
#[error(transparent)]
SlotError(#[from] TenantSlotError),
#[error(transparent)]
@@ -1629,8 +1677,8 @@ pub(crate) enum TenantMapListError {
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
{
pub(crate) async fn list_tenants(
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1638,7 +1686,9 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
@@ -2072,7 +2122,7 @@ where
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
// wait for it but return an error right away because these are distinct requests.
slot_guard.revert();
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
return Err(TenantStateError::IsStopping(tenant_shard_id));
}
}
Some(tenant)
@@ -2201,7 +2251,6 @@ pub(crate) async fn immediate_gc(
#[cfg(test)]
mod tests {
use pageserver_api::shard::TenantShardId;
use std::collections::BTreeMap;
use std::sync::Arc;
use tracing::{info_span, Instrument};
@@ -2222,7 +2271,7 @@ mod tests {
// harness loads it to active, which is forced and nothing is running on the tenant
let id = TenantShardId::unsharded(t.tenant_id());
let id = t.tenant_shard_id();
// tenant harness configures the logging and we cannot escape it
let _e = info_span!("testing", tenant_id = %id).entered();

View File

@@ -182,7 +182,7 @@
pub(crate) mod download;
pub mod index;
mod upload;
pub(crate) mod upload;
use anyhow::Context;
use camino::Utf8Path;
@@ -522,8 +522,6 @@ impl RemoteTimelineClient {
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
@@ -566,8 +564,6 @@ impl RemoteTimelineClient {
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
@@ -691,7 +687,10 @@ impl RemoteTimelineClient {
.insert(layer.layer_desc().filename(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!("scheduled layer file upload {layer}");
info!(
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);
let op = UploadOp::UploadLayer(layer, metadata);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -1348,8 +1347,6 @@ impl RemoteTimelineClient {
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
@@ -1375,8 +1372,6 @@ impl RemoteTimelineClient {
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),

View File

@@ -3,22 +3,36 @@ pub mod heatmap;
mod heatmap_uploader;
mod scheduler;
use std::sync::Arc;
use std::{sync::Arc, time::SystemTime};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::{
config::PageServerConf,
disk_usage_eviction_task::DiskUsageEvictionInfo,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
virtual_file::MaybeFatalIo,
};
use self::{
downloader::{downloader_task, SecondaryDetail},
heatmap_uploader::heatmap_uploader_task,
};
use super::{config::SecondaryLocationConfig, mgr::TenantManager};
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
};
use pageserver_api::shard::TenantShardId;
use pageserver_api::{
models,
shard::{ShardIdentity, TenantShardId},
};
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::{completion::Barrier, sync::gate::Gate};
use tracing::instrument;
use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
enum DownloadCommand {
Download(TenantShardId),
@@ -75,12 +89,20 @@ pub(crate) struct SecondaryTenant {
pub(crate) gate: Gate,
// Secondary mode does not need the full shard identity or the TenantConfOpt. However,
// storing them lets us report our full LocationConf, enabling convenient reconciliation
// by the control plane (see [`Self::get_location_conf`])
shard_identity: ShardIdentity,
tenant_conf: std::sync::Mutex<TenantConfOpt>,
detail: std::sync::Mutex<SecondaryDetail>,
}
impl SecondaryTenant {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard_identity: ShardIdentity,
tenant_conf: TenantConfOpt,
config: &SecondaryLocationConfig,
) -> Arc<Self> {
Arc::new(Self {
@@ -92,6 +114,9 @@ impl SecondaryTenant {
cancel: CancellationToken::new(),
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
shard_identity,
tenant_conf: std::sync::Mutex::new(tenant_conf),
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
})
}
@@ -107,9 +132,91 @@ impl SecondaryTenant {
self.detail.lock().unwrap().config = config.clone();
}
fn get_tenant_shard_id(&self) -> &TenantShardId {
pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
*(self.tenant_conf.lock().unwrap()) = *config;
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.detail.lock().unwrap().config.clone();
let conf = models::LocationConfigSecondary { warm: conf.warm };
let tenant_conf = *self.tenant_conf.lock().unwrap();
models::LocationConfig {
mode: models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(conf),
shard_number: self.tenant_shard_id.shard_number.0,
shard_count: self.tenant_shard_id.shard_count.0,
shard_stripe_size: self.shard_identity.stripe_size.0,
tenant_conf: tenant_conf.into(),
}
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
self.detail.lock().unwrap().get_layers_for_eviction(self)
}
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
&self,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();
let _guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
return;
}
};
let now = SystemTime::now();
let path = conf
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting layer during eviction");
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = self.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
}
}
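The ENOENT tolerance described in the comment above is the same idiom used earlier in this diff via `utils::fs_ext::ignore_not_found`: when another actor may legitimately delete the file first, "already gone" is treated as success and every other I/O error propagates. A minimal self-contained version, assuming `tokio`:

```rust
use std::io;
use std::path::Path;

// Treat "file already gone" as success; other I/O errors propagate.
fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

async fn remove_if_present(path: &Path) -> io::Result<()> {
    tokio::fs::remove_file(path).await.or_else(ignore_not_found)
}
```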
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,

View File

@@ -8,6 +8,9 @@ use std::{
use crate::{
config::PageServerConf,
disk_usage_eviction_task::{
finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
},
metrics::SECONDARY_MODE,
tenant::{
config::SecondaryLocationConfig,
@@ -142,6 +145,46 @@ impl SecondaryDetail {
timelines: HashMap::new(),
}
}
pub(super) fn get_layers_for_eviction(
&self,
parent: &Arc<SecondaryTenant>,
) -> DiskUsageEvictionInfo {
let mut result = DiskUsageEvictionInfo {
max_layer_size: None,
resident_layers: Vec::new(),
};
for (timeline_id, timeline_detail) in &self.timelines {
result
.resident_layers
.extend(timeline_detail.on_disk_layers.iter().map(|(name, ods)| {
EvictionCandidate {
layer: EvictionLayer::Secondary(EvictionSecondaryLayer {
secondary_tenant: parent.clone(),
timeline_id: *timeline_id,
name: name.clone(),
metadata: ods.metadata.clone(),
}),
last_activity_ts: ods.access_time,
relative_last_activity: finite_f32::FiniteF32::ZERO,
}
}));
}
result.max_layer_size = result
.resident_layers
.iter()
.map(|l| l.layer.get_file_size())
.max();
tracing::debug!(
"eviction: secondary tenant {} found {} timelines, {} layers",
parent.get_tenant_shard_id(),
self.timelines.len(),
result.resident_layers.len()
);
result
}
}
struct PendingDownload {

View File

@@ -15,7 +15,7 @@ use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::Key;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, RemoteTimelineClient, Timeline};
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer;
@@ -204,17 +204,14 @@ impl Layer {
///
/// Technically cancellation safe, but cancelling might shift the viewpoint of which
/// generation of the download-evict cycle is observed on retry.
pub(crate) async fn evict_and_wait(
&self,
rtc: &RemoteTimelineClient,
) -> Result<(), EvictionError> {
self.0.evict_and_wait(rtc).await
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
self.0.evict_and_wait().await
}
/// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
/// then.
///
/// On drop, this will cause a call to [`RemoteTimelineClient::schedule_deletion_of_unlinked`].
/// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
/// This means that the unlinking by [gc] or [compaction] must have happened strictly before
/// the value this is called on gets dropped.
///
@@ -606,10 +603,7 @@ impl LayerInner {
/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(
&self,
_: &RemoteTimelineClient,
) -> Result<(), EvictionError> {
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
use tokio::sync::broadcast::error::RecvError;
assert!(self.have_remote_client);

View File

@@ -15,9 +15,10 @@ use fail::fail_point;
use itertools::Itertools;
use pageserver_api::{
models::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
TimelineState,
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, TenantShardId},
};
use rand::Rng;
@@ -42,33 +43,38 @@ use std::{
ops::ControlFlow,
};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState,
};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
metadata::{save_metadata, TimelineMetadata},
par_fsync,
};
use crate::{
context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
disk_usage_eviction_task::DiskUsageEvictionInfo,
};
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState,
},
};
use crate::{
disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
};
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
use crate::tenant::config::TenantConfOpt;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
@@ -246,6 +252,10 @@ pub struct Timeline {
pub(super) metrics: TimelineMetrics,
// `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code
// in `crate::page_service` writes these metrics.
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
/// Ensures layers aren't frozen by checkpointer between
/// [`Timeline::get_layer_for_write`] and layer reads.
/// Locked automatically by [`TimelineWriter`] and checkpointer.
@@ -1134,12 +1144,7 @@ impl Timeline {
return Ok(None);
};
let rtc = self
.remote_client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
match local_layer.evict_and_wait(rtc).await {
match local_layer.evict_and_wait().await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
@@ -1314,6 +1319,11 @@ impl Timeline {
),
),
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
&timeline_id,
),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
layer_flush_start_tx,
@@ -2103,7 +2113,7 @@ impl Timeline {
let layer_file_names = eviction_info
.resident_layers
.iter()
.map(|l| l.layer.layer_desc().filename())
.map(|l| l.layer.get_name())
.collect::<Vec<_>>();
let decorated = match remote_client.get_layers_metadata(layer_file_names) {
@@ -2121,7 +2131,7 @@ impl Timeline {
.filter_map(|(layer, remote_info)| {
remote_info.map(|remote_info| {
HeatMapLayer::new(
layer.layer.layer_desc().filename(),
layer.layer.get_name(),
IndexLayerMetadata::from(remote_info),
layer.last_activity_ts,
)
@@ -4424,43 +4434,6 @@ impl Timeline {
}
}
pub(crate) struct DiskUsageEvictionInfo {
/// Timeline's largest layer (remote or resident)
pub max_layer_size: Option<u64>,
/// Timeline's resident layers
pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
}
pub(crate) struct LocalLayerInfoForDiskUsageEviction {
pub layer: Layer,
pub last_activity_ts: SystemTime,
}
impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
// having to allocate a string to this is bad, but it will rarely be formatted
let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
struct DisplayIsDebug<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
f.debug_struct("LocalLayerInfoForDiskUsageEviction")
.field("layer", &DisplayIsDebug(&self.layer))
.field("last_activity", &ts)
.finish()
}
}
impl LocalLayerInfoForDiskUsageEviction {
pub fn file_size(&self) -> u64 {
self.layer.layer_desc().file_size
}
}
impl Timeline {
/// Returns non-remote layers for eviction.
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
@@ -4494,9 +4467,10 @@ impl Timeline {
SystemTime::now()
});
resident_layers.push(LocalLayerInfoForDiskUsageEviction {
layer: l.drop_eviction_guard(),
resident_layers.push(EvictionCandidate {
layer: l.drop_eviction_guard().into(),
last_activity_ts,
relative_last_activity: finite_f32::FiniteF32::ZERO,
});
}
@@ -4653,11 +4627,6 @@ mod tests {
.await
.unwrap();
let rtc = timeline
.remote_client
.clone()
.expect("just configured this");
let layer = find_some_layer(&timeline).await;
let layer = layer
.keep_resident()
@@ -4666,8 +4635,8 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let first = async { layer.evict_and_wait(&rtc).await };
let second = async { layer.evict_and_wait(&rtc).await };
let first = async { layer.evict_and_wait().await };
let second = async { layer.evict_and_wait().await };
let (first, second) = tokio::join!(first, second);

View File

@@ -20,6 +20,7 @@ use std::{
time::{Duration, SystemTime},
};
use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
@@ -29,10 +30,7 @@ use crate::{
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
tasks::BackgroundLoopKind,
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
},
};
@@ -215,13 +213,10 @@ impl Timeline {
// So, we just need to deal with this.
let remote_client = match self.remote_client.as_ref() {
Some(c) => c,
None => {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
};
if self.remote_client.is_none() {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
let mut js = tokio::task::JoinSet::new();
{
@@ -274,9 +269,8 @@ impl Timeline {
};
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
let remote_client = remote_client.clone();
// this could cause a lot of allocations in some cases
js.spawn(async move { layer.evict_and_wait(&remote_client).await });
js.spawn(async move { layer.evict_and_wait().await });
stats.candidates += 1;
}
}

View File

@@ -14,6 +14,7 @@ use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
@@ -60,6 +61,7 @@ pub struct VirtualFile {
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
// strings.
tenant_id: String,
shard_id: String,
timeline_id: String,
}
@@ -301,15 +303,24 @@ impl VirtualFile {
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
let tenant_id;
let timeline_id;
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
tenant_id = parts[parts.len() - 4].to_string();
timeline_id = parts[parts.len() - 2].to_string();
} else {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (tenant_id, shard_id, timeline_id) =
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
let tenant_shard_part = parts[parts.len() - 4];
let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
Ok(tenant_shard_id) => (
tenant_shard_id.tenant_id.to_string(),
format!("{}", tenant_shard_id.shard_slug()),
),
Err(_) => {
// Malformed path: this ID is just for observability, so tolerate it
// and pass through
(tenant_shard_part.to_string(), "*".to_string())
}
};
(tenant_id, shard_id, parts[parts.len() - 2].to_string())
} else {
("*".to_string(), "*".to_string(), "*".to_string())
};
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
@@ -333,6 +344,7 @@ impl VirtualFile {
path: path.to_path_buf(),
open_options: reopen_options,
tenant_id,
shard_id,
timeline_id,
};
@@ -574,7 +586,7 @@ impl VirtualFile {
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
.add(size as i64);
}
result
@@ -586,7 +598,7 @@ impl VirtualFile {
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
.add(size as i64);
}
result
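
To make the new label derivation concrete, here is an illustrative, deliberately simplified version of how the tenant/shard/timeline strings fall out of a path shaped like `.../tenants/<tenant_shard_id>/timelines/<timeline_id>/<file>`. The real code delegates to `TenantShardId` parsing and `shard_slug()`; the `-`-separated shard suffix assumed below is for illustration only:

```rust
fn labels_from_path(path: &str) -> (String, String, String) {
    let parts: Vec<&str> = path.split('/').collect();
    if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {
        let tenant_shard_part = parts[parts.len() - 4];
        // Assumed layout: a sharded directory carries a shard suffix
        // after a '-'; an unsharded one is just the tenant id.
        let (tenant_id, shard_id) = match tenant_shard_part.split_once('-') {
            Some((tenant, shard)) => (tenant.to_string(), shard.to_string()),
            None => (tenant_shard_part.to_string(), "*".to_string()),
        };
        (tenant_id, shard_id, parts[parts.len() - 2].to_string())
    } else {
        // Labels are observability-only, so a malformed path degrades
        // to wildcards rather than failing the open().
        ("*".into(), "*".into(), "*".into())
    }
}
```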

View File

@@ -38,7 +38,7 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::walrecord::*;
use crate::ZERO_PAGE;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
@@ -2201,7 +2201,8 @@ mod tests {
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
let remote_initdb_path =
remote_initdb_archive_path(&tenant.tenant_shard_id().tenant_id, &TIMELINE_ID);
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
std::fs::create_dir_all(initdb_path.parent().unwrap())

View File

@@ -47,9 +47,11 @@ use crate::metrics::{
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::pgdatadir_mapping::key_to_slru_block;
use crate::repository::Key;
use crate::walrecord::NeonWalRecord;
use pageserver_api::key::key_to_rel_block;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;

View File

@@ -40,11 +40,23 @@ typedef struct
{
RelTag tag;
BlockNumber size;
dlist_node lru_node; /* LRU list node */
} RelSizeEntry;
typedef struct
{
size_t size;
uint64 hits;
uint64 misses;
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
} RelSizeHashControl;
static HTAB *relsize_hash;
static LWLockId relsize_lock;
static int relsize_hash_size;
static RelSizeHashControl* relsize_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
@@ -52,7 +64,7 @@ static void relsize_shmem_request(void);
#endif
/*
* Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
* Size of a cache entry is 36 bytes. So this default will take about 2.3 MB,
* which seems reasonable.
*/
#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
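For scale, that works out to 64 × 1024 entries × 36 bytes/entry = 2,359,296 bytes ≈ 2.25 MiB, in line with the "about 2.3 MB" estimate above, plus the small fixed-size RelSizeHashControl header.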
@@ -61,19 +73,29 @@ static void
neon_smgr_shmem_startup(void)
{
static HASHCTL info;
bool found;
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
info.keysize = sizeof(RelTag);
info.entrysize = sizeof(RelSizeEntry);
relsize_hash = ShmemInitHash("neon_relsize",
relsize_hash_size, relsize_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found);
if (!found)
{
relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
info.keysize = sizeof(RelTag);
info.entrysize = sizeof(RelSizeEntry);
relsize_hash = ShmemInitHash("neon_relsize",
relsize_hash_size, relsize_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
relsize_ctl->size = 0;
relsize_ctl->hits = 0;
relsize_ctl->misses = 0;
relsize_ctl->writes = 0;
dlist_init(&relsize_ctl->lru);
}
}
bool
@@ -93,7 +115,15 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
if (entry != NULL)
{
*size = entry->size;
relsize_ctl->hits += 1;
found = true;
/* Move entry to the LRU list tail */
dlist_delete(&entry->lru_node);
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
}
else
{
relsize_ctl->misses += 1;
}
LWLockRelease(relsize_lock);
}
@@ -107,12 +137,43 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
RelTag tag;
RelSizeEntry *entry;
bool found = false;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
/*
* This should actually never happen! Below we check whether the hash is full and delete the least recently used item in that case.
* But for further safety we also perform the check here.
*/
while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL)
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
Assert(relsize_ctl->size > 0);
relsize_ctl->size -= 1;
}
entry->size = size;
if (!found)
{
if (++relsize_ctl->size == relsize_hash_size)
{
/*
* Remove the least recently used element from the hash.
* The hash size afterwards becomes `relsize_hash_size - 1`.
* This is not considered a problem, because the hash is expected to be large enough that being off by one doesn't matter.
*/
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
relsize_ctl->size -= 1;
}
}
else
{
dlist_delete(&entry->lru_node);
}
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
relsize_ctl->writes += 1;
LWLockRelease(relsize_lock);
}
}
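
A minimal sketch, in Rust for brevity, of the replacement policy the C code above implements with an intrusive dlist: every hit or write moves an entry to the most-recently-used tail, and a full cache evicts from the least-recently-used head before inserting. The toy `VecDeque` touch below is O(n), where the intrusive list is O(1):

```rust
use std::collections::{HashMap, VecDeque};

// Toy stand-ins for the RelTag key and block-count value.
type Tag = (u32, u32); // (relation, fork)

struct LruRelsizeCache {
    capacity: usize,
    map: HashMap<Tag, u32>,
    order: VecDeque<Tag>, // front = LRU, back = MRU
}

impl LruRelsizeCache {
    fn new(capacity: usize) -> Self {
        Self { capacity, map: HashMap::new(), order: VecDeque::new() }
    }

    // The dlist_delete + dlist_push_tail pair from the C code.
    fn touch(&mut self, key: Tag) {
        if let Some(pos) = self.order.iter().position(|k| *k == key) {
            self.order.remove(pos);
        }
        self.order.push_back(key);
    }

    fn get(&mut self, key: Tag) -> Option<u32> {
        let hit = self.map.get(&key).copied();
        if hit.is_some() {
            self.touch(key); // a hit refreshes recency
        }
        hit
    }

    fn set(&mut self, key: Tag, size: u32) {
        if !self.map.contains_key(&key) && self.map.len() >= self.capacity {
            // Evict the LRU victim before inserting, as
            // set_cached_relsize does when the shared hash fills up.
            if let Some(victim) = self.order.pop_front() {
                self.map.remove(&victim);
            }
        }
        self.map.insert(key, size);
        self.touch(key);
    }
}
```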
@@ -132,6 +193,21 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
if (!found || entry->size < size)
entry->size = size;
if (!found)
{
if (++relsize_ctl->size == relsize_hash_size)
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
relsize_ctl->size -= 1;
}
}
else
{
dlist_delete(&entry->lru_node);
}
relsize_ctl->writes += 1;
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
LWLockRelease(relsize_lock);
}
}
@@ -142,11 +218,16 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
entry = hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
if (entry)
{
dlist_delete(&entry->lru_node);
relsize_ctl->size -= 1;
}
LWLockRelease(relsize_lock);
}
}
@@ -191,7 +272,7 @@ relsize_shmem_request(void)
if (prev_shmem_request_hook)
prev_shmem_request_hook();
RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestNamedLWLockTranche("neon_relsize", 1);
}
#endif

poetry.lock generated
View File

@@ -158,28 +158,6 @@ files = [
attrs = ">=16.0.0"
pluggy = ">=0.4.0"
[[package]]
name = "anyio"
version = "4.2.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
{file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
]
[package.dependencies]
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
idna = ">=2.8"
sniffio = ">=1.1"
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
[package.extras]
doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
trio = ["trio (>=0.23)"]
[[package]]
name = "async-timeout"
version = "4.0.3"
@@ -1086,100 +1064,6 @@ files = [
{file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
]
[[package]]
name = "h11"
version = "0.14.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
optional = false
python-versions = ">=3.7"
files = [
{file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
]
[[package]]
name = "h2"
version = "4.1.0"
description = "HTTP/2 State-Machine based protocol implementation"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
{file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
]
[package.dependencies]
hpack = ">=4.0,<5"
hyperframe = ">=6.0,<7"
[[package]]
name = "hpack"
version = "4.0.0"
description = "Pure-Python HPACK header compression"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
{file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
]
[[package]]
name = "httpcore"
version = "1.0.2"
description = "A minimal low-level HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpcore-1.0.2-py3-none-any.whl", hash = "sha256:096cc05bca73b8e459a1fc3dcf585148f63e534eae4339559c9b8a8d6399acc7"},
{file = "httpcore-1.0.2.tar.gz", hash = "sha256:9fc092e4799b26174648e54b74ed5f683132a464e95643b226e00c2ed2fa6535"},
]
[package.dependencies]
certifi = "*"
h11 = ">=0.13,<0.15"
[package.extras]
asyncio = ["anyio (>=4.0,<5.0)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
trio = ["trio (>=0.22.0,<0.23.0)"]
[[package]]
name = "httpx"
version = "0.26.0"
description = "The next generation HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
{file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
]
[package.dependencies]
anyio = "*"
certifi = "*"
h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
httpcore = "==1.*"
idna = "*"
sniffio = "*"
[package.extras]
brotli = ["brotli", "brotlicffi"]
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
[[package]]
name = "hyperframe"
version = "6.0.1"
description = "HTTP/2 framing layer for Python"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
{file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
]
[[package]]
name = "idna"
version = "3.3"
@@ -2331,17 +2215,6 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
[[package]]
name = "sniffio"
version = "1.3.0"
description = "Sniff out which async library your code is running under"
optional = false
python-versions = ">=3.7"
files = [
{file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
]
[[package]]
name = "sshpubkeys"
version = "3.3.1"
@@ -2505,87 +2378,6 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
optional = ["python-socks", "wsaccel"]
test = ["websockets"]
[[package]]
name = "websockets"
version = "12.0"
description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
optional = false
python-versions = ">=3.8"
files = [
{file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"},
{file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"},
{file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"},
{file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"},
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"},
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"},
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"},
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"},
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"},
{file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"},
{file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"},
{file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"},
{file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"},
{file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"},
{file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"},
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"},
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"},
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"},
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"},
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"},
{file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"},
{file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"},
{file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"},
{file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"},
{file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"},
{file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"},
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"},
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"},
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"},
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"},
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"},
{file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"},
{file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"},
{file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"},
{file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"},
{file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"},
{file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"},
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"},
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"},
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"},
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"},
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"},
{file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"},
{file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"},
{file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"},
{file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"},
{file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"},
{file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"},
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"},
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"},
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"},
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"},
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"},
{file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"},
{file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"},
{file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"},
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"},
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"},
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"},
{file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"},
{file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"},
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"},
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"},
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"},
{file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"},
{file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"},
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"},
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"},
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"},
{file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"},
{file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"},
{file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"},
]
[[package]]
name = "werkzeug"
version = "3.0.1"
@@ -2629,6 +2421,16 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2866,4 +2668,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "f750bd06f1937f0614204e0ffe9a293eb61a0d7d675a80d5849f40a22745b5f9"
content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860"

View File

@@ -27,7 +27,12 @@ hex.workspace = true
hmac.workspace = true
hostname.workspace = true
humantime.workspace = true
hyper.workspace = true
hyper-tungstenite.workspace = true
hyper = { workspace = true, features = ["server"] }
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
ipnet.workspace = true
itertools.workspace = true
md5.workspace = true
@@ -65,13 +70,11 @@ tls-listener.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio-tungstenite.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
tungstenite.workspace = true
url.workspace = true
utils.workspace = true
uuid.workspace = true

View File

@@ -4,14 +4,12 @@
pub mod health_server;
use std::{sync::Arc, time::Duration};
use std::time::Duration;
use futures::FutureExt;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant;
use tracing::trace;
use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
use reqwest_middleware::RequestBuilder;
@@ -21,8 +19,6 @@ use reqwest_middleware::RequestBuilder;
/// We deliberately don't want to replace this with a public static.
pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
let client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.build()
.expect("Failed to create http client");
@@ -34,8 +30,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien
pub fn new_client_with_timeout(default_timeout: Duration) -> ClientWithMiddleware {
let timeout_client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.timeout(default_timeout)
.build()
.expect("Failed to create http client with timeout");
@@ -100,37 +94,6 @@ impl Endpoint {
}
}
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
use hyper::{
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
service::Service,
};
use reqwest::dns::{Addrs, Resolve, Resolving};
#[derive(Debug)]
pub struct GaiResolver(HyperGaiResolver);
impl Default for GaiResolver {
fn default() -> Self {
Self(HyperGaiResolver::new())
}
}
impl Resolve for GaiResolver {
fn resolve(&self, name: Name) -> Resolving {
let this = &mut self.0.clone();
let start = Instant::now();
Box::pin(
Service::<Name>::call(this, name.clone()).map(move |result| {
let resolve_duration = start.elapsed();
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
result
.map(|addrs| -> Addrs { Box::new(addrs) })
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
}),
)
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -1,14 +1,21 @@
use anyhow::{anyhow, bail};
use hyper::{Body, Request, Response, StatusCode};
use anyhow::anyhow;
use http::{Request, Response};
use hyper::StatusCode;
use hyper_util::{
rt::{TokioExecutor, TokioIo},
server::conn,
};
use std::{convert::Infallible, net::TcpListener};
use tracing::info;
use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService};
use utils::http::{
endpoint, error::ApiError, json::json_response, Body, RequestServiceBuilder, RouterBuilder,
};
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, "")
json_response(StatusCode::OK, "").map(|req| req.map(Body::new))
}
fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
fn make_router() -> RouterBuilder<Body, ApiError> {
endpoint::make_router().get("/v1/status", status_handler)
}
@@ -17,11 +24,20 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible>
info!("http has shut down");
}
let service = || RouterService::new(make_router().build()?);
let router = make_router().build().map_err(|e| anyhow!(e))?;
let builder = RequestServiceBuilder::new(router).map_err(|e| anyhow!(e))?;
let listener = tokio::net::TcpListener::from_std(http_listener)?;
hyper::Server::from_tcp(http_listener)?
.serve(service().map_err(|e| anyhow!(e))?)
.await?;
bail!("hyper server without shutdown handling cannot shutdown successfully");
loop {
let (stream, remote_addr) = listener.accept().await.unwrap();
let io = TokioIo::new(stream);
let service = builder.build(remote_addr);
tokio::task::spawn(async move {
let builder = conn::auto::Builder::new(TokioExecutor::new());
let res = builder.serve_connection(io, service).await;
if let Err(err) = res {
println!("Error serving connection: {:?}", err);
}
});
}
}
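
The new loop unwraps accept errors and still has no shutdown handling (the hyper 0.14 version it replaces bailed for the same reason). A hedged sketch of how the same hyper-util pattern composes with cancellation, mirroring the serverless server later in this diff; the `CancellationToken` plumbing and the stand-in handler are assumptions, not part of this change:

```rust
use std::pin::pin;

use hyper::body::Incoming;
use hyper::service::service_fn;
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn;
use tokio_util::sync::CancellationToken;

async fn serve_until_cancelled(
    listener: tokio::net::TcpListener,
    cancel: CancellationToken,
) -> std::io::Result<()> {
    loop {
        let (stream, _addr) = tokio::select! {
            res = listener.accept() => res?, // propagate instead of unwrap
            _ = cancel.cancelled() => return Ok(()),
        };
        let io = TokioIo::new(stream);
        let cancel = cancel.clone();
        tokio::task::spawn(async move {
            // Stand-in handler; the real code builds a router service instead.
            let service = service_fn(|_req: hyper::Request<Incoming>| async {
                Ok::<_, std::convert::Infallible>(hyper::Response::new(
                    http_body_util::Full::new(bytes::Bytes::from_static(b"ok")),
                ))
            });
            let builder = conn::auto::Builder::new(TokioExecutor::new());
            let mut conn = pin!(builder.serve_connection(io, service));
            tokio::select! {
                _ = cancel.cancelled() => {
                    // Let in-flight requests finish, then close the connection.
                    conn.as_mut().graceful_shutdown();
                    let _ = conn.await;
                }
                res = conn.as_mut() => {
                    if let Err(err) = res {
                        tracing::error!("error serving connection: {err:?}");
                    }
                }
            }
        });
    }
}
```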

View File

@@ -10,13 +10,15 @@ use std::{
};
use bytes::{Buf, BytesMut};
use hyper::server::conn::{AddrIncoming, AddrStream};
use pin_project_lite::pin_project;
use tls_listener::AsyncAccept;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use tokio::{
io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf},
net::{TcpListener, TcpStream},
};
pub struct ProxyProtocolAccept {
pub incoming: AddrIncoming,
pub incoming: TcpListener,
}
pin_project! {
@@ -327,20 +329,18 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
}
impl AsyncAccept for ProxyProtocolAccept {
type Connection = WithClientIp<AddrStream>;
type Connection = WithClientIp<TcpStream>;
type Address = SocketAddr;
type Error = io::Error;
fn poll_accept(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
let Some(conn) = conn else {
return Poll::Ready(None);
};
Poll::Ready(Some(Ok(WithClientIp::new(conn))))
) -> Poll<Result<(Self::Connection, Self::Address), Self::Error>> {
Pin::new(&mut self.incoming)
.poll_accept(cx)
.map_ok(|(c, a)| (WithClientIp::new(c), a))
}
}

View File

@@ -7,8 +7,8 @@ use crate::{
proxy::retry::{retry_after, ShouldRetry},
};
use async_trait::async_trait;
use hyper::StatusCode;
use pq_proto::StartupMessageParams;
use reqwest::StatusCode;
use std::ops::ControlFlow;
use tokio::time;
use tracing::{error, info, warn};

View File

@@ -6,41 +6,54 @@ mod conn_pool;
mod sql_over_http;
mod websocket;
use bytes::Bytes;
pub use conn_pool::GlobalConnPoolOptions;
use anyhow::bail;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::StatusCode;
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn;
use metrics::IntCounterPairGuard;
use rand::rngs::StdRng;
use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::select;
use tokio_rustls::TlsAcceptor;
use tokio_util::task::TaskTracker;
use crate::config::TlsConfig;
use crate::context::RequestMonitoring;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::protocol2::ProxyProtocolAccept;
use crate::rate_limiter::EndpointRateLimiter;
use crate::{cancellation::CancelMap, config::ProxyConfig};
use futures::StreamExt;
use hyper::{
server::{
accept,
conn::{AddrIncoming, AddrStream},
},
Body, Method, Request, Response,
};
use hyper::{Method, Request, Response};
use std::net::IpAddr;
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;
use std::pin::pin;
use std::sync::Arc;
use tls_listener::{AsyncTls, TlsListener};
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
use utils::http::{error::ApiError, json::json_response};
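/// Adapter so `tls_listener::TlsListener` can drive tokio-rustls: `AsyncTls`
/// turns each accepted connection into a TLS handshake future.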
#[derive(Clone)]
struct Tls(TlsAcceptor);
impl<C: AsyncRead + AsyncWrite + Unpin> AsyncTls<C> for Tls {
type Stream = tokio_rustls::server::TlsStream<C>;
type Error = std::io::Error;
type AcceptFuture = tokio_rustls::Accept<C>;
fn accept(&self, conn: C) -> Self::AcceptFuture {
tokio_rustls::TlsAcceptor::accept(&self.0, conn)
}
}
pub async fn task_main(
config: &'static ProxyConfig,
ws_listener: TcpListener,
@@ -77,53 +90,55 @@ pub async fn task_main(
return Ok(());
}
};
let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
// ALPN protocols are listed in preference order: prefer http2, but still support http/1.1
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let _ = addr_incoming.set_nodelay(true);
// let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
// let _ = addr_incoming.set_nodelay(true);
let addr_incoming = ProxyProtocolAccept {
incoming: addr_incoming,
incoming: ws_listener,
};
let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
let ws_connections2 = ws_connections.clone();
ws_connections.close(); // allows `ws_connections.wait` to complete
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
if let Err(err) = conn {
error!("failed to accept TLS connection for websockets: {err:?}");
ready(false)
} else {
ready(true)
}
});
let mut tls_listener = TlsListener::new(Tls(tls_acceptor), addr_incoming);
let make_svc = hyper::service::make_service_fn(
|stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
tokio::spawn(async move {
loop {
let (stream, remote_addr) = select! {
res = tls_listener.accept() => {
match res {
Err(err) =>
{error!("failed to accept TLS connection for websockets: {err:?}"); continue},
Ok(s) => s,
}
}
_ = cancellation_token.cancelled() => break,
};
let (io, tls) = stream.get_ref();
let client_addr = io.client_addr();
let remote_addr = io.inner.remote_addr();
let sni_name = tls.server_name().map(|s| s.to_string());
let protocol = tls
.alpn_protocol()
.map(|s| String::from_utf8_lossy(s).into_owned());
let conn_pool = conn_pool.clone();
let ws_connections = ws_connections.clone();
let ws_connections = ws_connections2.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
async move {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => bail!("missing required client ip"),
None => remote_addr,
};
Ok(MetricService::new(hyper::service::service_fn(
move |req: Request<Body>| {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => {
tracing::error!("Error serving connection: missing required client ip");
continue;
}
None => remote_addr,
};
let io = TokioIo::new(stream);
let cancellation_token = cancellation_token.clone();
tokio::task::spawn(async move {
let service = MetricService::new(hyper::service::service_fn(
move |req: Request<Incoming>| {
let sni_name = sni_name.clone();
let protocol = protocol.clone();
let conn_pool = conn_pool.clone();
let ws_connections = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -148,21 +163,26 @@ pub async fn task_main(
"serverless",
session = %session_id,
%peer_addr,
http_protocol = ?protocol,
))
.await
}
},
)))
}
},
);
hyper::Server::builder(accept::from_stream(tls_listener))
.http2_enable_connect_protocol()
.serve(make_svc)
.with_graceful_shutdown(cancellation_token.cancelled())
.await?;
));
let builder = conn::auto::Builder::new(TokioExecutor::new());
let mut conn = pin!(builder.serve_connection(io, service));
let res = select! {
_ = cancellation_token.cancelled() => {
conn.as_mut().graceful_shutdown();
conn.await
}
res = conn.as_mut() => res,
};
if let Err(err) = res {
tracing::error!("Error serving connection: {:?}", err);
}
});
}
});
// await websocket connections
ws_connections.wait().await;
@@ -194,18 +214,14 @@ where
type Error = S::Error;
type Future = S::Future;
fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
self.inner.poll_ready(cx)
}
fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
fn call(&self, req: Request<ReqBody>) -> Self::Future {
self.inner.call(req)
}
}
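
The `call` signature change above follows hyper 1.0's `Service` trait, which dropped `poll_ready` and takes `&self` instead of `&mut self`. A minimal sketch of what that implies for a hand-rolled service (a hypothetical counter, not from this diff): per-request state has to live behind interior mutability.

```rust
use std::convert::Infallible;
use std::future::Future;
use std::pin::Pin;
use std::sync::atomic::{AtomicU64, Ordering};

use bytes::Bytes;
use http_body_util::Full;
use hyper::service::Service;
use hyper::{Request, Response};

struct Counting {
    hits: AtomicU64,
}

impl<B> Service<Request<B>> for Counting {
    type Response = Response<Full<Bytes>>;
    type Error = Infallible;
    type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;

    fn call(&self, _req: Request<B>) -> Self::Future {
        // &self, not &mut self: bump the counter atomically.
        let n = self.hits.fetch_add(1, Ordering::Relaxed) + 1;
        Box::pin(async move { Ok(Response::new(Full::new(Bytes::from(format!("hit #{n}"))))) })
    }
}

#[tokio::main]
async fn main() {
    let svc = Counting { hits: AtomicU64::new(0) };
    let req = Request::builder().uri("/").body(()).unwrap();
    let resp = svc.call(req).await.unwrap();
    assert_eq!(resp.status(), hyper::StatusCode::OK);
}
```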
#[allow(clippy::too_many_arguments)]
async fn request_handler(
mut request: Request<Body>,
mut request: Request<Incoming>,
config: &'static ProxyConfig,
tls: &'static TlsConfig,
conn_pool: Arc<conn_pool::GlobalConnPool>,
@@ -215,7 +231,7 @@ async fn request_handler(
sni_hostname: Option<String>,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
let host = request
.headers()
.get("host")
@@ -223,13 +239,11 @@ async fn request_handler(
.and_then(|h| h.split(':').next())
.map(|s| s.to_string());
let ws_config = None;
// Check if the request is a websocket upgrade request.
if websocket::is_upgrade_request(&request) {
if hyper_tungstenite::is_upgrade_request(&request) {
info!(session_id = ?session_id, "performing websocket upgrade");
let (response, websocket) = websocket::upgrade(&mut request, ws_config)
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
ws_connections.spawn(
@@ -252,34 +266,6 @@ async fn request_handler(
.in_current_span(),
);
// Return the response so the spawned future can continue.
Ok(response)
} else if websocket::is_connect_request(&request) {
info!(session_id = ?session_id, "performing http2 websocket upgrade");
let (response, websocket) = websocket::connect(&mut request, ws_config)
.map_err(|e| ApiError::BadRequest(e.into()))?;
ws_connections.spawn(
async move {
let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws2", &config.region);
if let Err(e) = websocket::serve_websocket(
config,
&mut ctx,
websocket,
&cancel_map,
host,
endpoint_rate_limiter,
)
.await
{
error!(session_id = ?session_id, "error in http2 websocket connection: {e:#}");
}
}
.in_current_span(),
);
// Return the response so the spawned future can continue.
Ok(response)
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
@@ -296,7 +282,7 @@ async fn request_handler(
.await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
Response::builder()
.header("Allow", "OPTIONS, POST, CONNECT")
.header("Allow", "OPTIONS, POST")
.header("Access-Control-Allow-Origin", "*")
.header(
"Access-Control-Allow-Headers",
@@ -304,7 +290,7 @@ async fn request_handler(
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
.body(Body::empty())
.body(Full::new(Bytes::new()))
.map_err(|e| ApiError::InternalServerError(e.into()))
} else {
json_response(StatusCode::BAD_REQUEST, "query is not supported")

View File

@@ -1,20 +1,26 @@
use std::sync::Arc;
use anyhow::bail;
use bytes::Buf;
use bytes::Bytes;
use futures::pin_mut;
use futures::StreamExt;
use hyper::body::HttpBody;
use http_body::Body;
use http_body_util::BodyExt;
use http_body_util::Full;
use hyper::body::Incoming;
use hyper::header;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::Response;
use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use hyper::{HeaderMap, Request};
use serde_json::json;
use serde_json::Map;
use serde_json::Value;
use smol_str::SmolStr;
use tokio_postgres::error::DbError;
use tokio_postgres::error::ErrorPosition;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
@@ -59,6 +65,7 @@ enum Payload {
const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -176,10 +183,11 @@ fn get_conn_info(
.and_then(|h| h.to_str().ok())
.and_then(|h| h.split(':').next());
if hostname != sni_hostname {
// sni_hostname has to be either the same as hostname or the one used by the serverless driver.
if !check_matches(&sni_hostname, hostname)? {
return Err(anyhow::anyhow!("mismatched SNI hostname and hostname"));
} else if let Some(h) = host_header {
if h != hostname {
if h != sni_hostname {
return Err(anyhow::anyhow!("mismatched host header and hostname"));
}
}
@@ -213,15 +221,29 @@ fn get_conn_info(
})
}
fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Error> {
if sni_hostname == hostname {
return Ok(true);
}
let (sni_hostname_first, sni_hostname_rest) = sni_hostname
.split_once('.')
.ok_or_else(|| anyhow::anyhow!("Unexpected sni format."))?;
let (_, hostname_rest) = hostname
.split_once('.')
.ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
Ok(sni_hostname_rest == hostname_rest
&& sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
}
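
To make the rule concrete, a few hypothetical inputs (the domain names are made up for illustration): only the first DNS label may differ, and then only when the SNI side carries the fixed serverless-driver `api` label.

```rust
#[test]
fn check_matches_examples() {
    // Exact match is always accepted.
    assert!(check_matches("ep-a.us-east-2.aws.neon.tech", "ep-a.us-east-2.aws.neon.tech").unwrap());
    // Serverless driver: SNI first label is "api" and the rest matches.
    assert!(check_matches("api.us-east-2.aws.neon.tech", "ep-a.us-east-2.aws.neon.tech").unwrap());
    // Any other difference is rejected.
    assert!(!check_matches("api.eu-west-1.aws.neon.tech", "ep-a.us-east-2.aws.neon.tech").unwrap());
    // A hostname without any dot is an error rather than a mismatch.
    assert!(check_matches("localhost", "ep-a.us-east-2.aws.neon.tech").is_err());
}
```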
// TODO: return different http error codes
pub async fn handle(
tls: &'static TlsConfig,
config: &'static HttpConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> Result<Response<Body>, ApiError> {
) -> Result<Response<Full<Bytes>>, ApiError> {
let result = tokio::time::timeout(
config.request_timeout,
handle_inner(tls, config, ctx, request, sni_hostname, conn_pool),
@@ -231,7 +253,7 @@ pub async fn handle(
Ok(r) => match r {
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
let mut message = format!("{:?}", e);
let db_error = e
.downcast_ref::<tokio_postgres::Error>()
.and_then(|e| e.as_db_error());
@@ -244,7 +266,25 @@ pub async fn handle(
.unwrap_or_default()
}
// TODO(conrad): db_error.position()
if let Some(db_error) = db_error {
db_error.message().clone_into(&mut message);
}
let position = db_error.and_then(|db| db.position());
let (position, internal_position, internal_query) = match position {
Some(ErrorPosition::Original(position)) => (
Value::String(position.to_string()),
Value::Null,
Value::Null,
),
Some(ErrorPosition::Internal { position, query }) => (
Value::Null,
Value::String(position.to_string()),
Value::String(query.clone()),
),
None => (Value::Null, Value::Null, Value::Null),
};
let code = get(db_error, |db| db.code().code());
let severity = get(db_error, |db| db.severity());
let detail = get(db_error, |db| db.detail());
@@ -256,7 +296,7 @@ pub async fn handle(
let datatype = get(db_error, |db| db.datatype());
let constraint = get(db_error, |db| db.constraint());
let file = get(db_error, |db| db.file());
let line = get(db_error, |db| db.line());
let line = get(db_error, |db| db.line().map(|l| l.to_string()));
let routine = get(db_error, |db| db.routine());
error!(
@@ -271,12 +311,15 @@ pub async fn handle(
"code": code,
"detail": detail,
"hint": hint,
"position": position,
"internalPosition": internal_position,
"internalQuery": internal_query,
"severity": severity,
"where": where_,
"table": table,
"column": column,
"schema": schema,
"datatype": datatype,
"dataType": datatype,
"constraint": constraint,
"file": file,
"line": line,
@@ -309,10 +352,10 @@ async fn handle_inner(
tls: &'static TlsConfig,
config: &'static HttpConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
request: Request<Incoming>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> anyhow::Result<Response<Body>> {
) -> anyhow::Result<Response<Full<Bytes>>> {
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&["http"])
.guard();
@@ -368,8 +411,8 @@ async fn handle_inner(
//
// Read the query and query params from the request body
//
let body = hyper::body::to_bytes(request.into_body()).await?;
let payload: Payload = serde_json::from_slice(&body)?;
let body = request.into_body().collect().await?.aggregate().reader();
let payload: Payload = serde_json::from_reader(body)?;
let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;
@@ -466,7 +509,7 @@ async fn handle_inner(
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
.body(Full::from(body))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");

View File

@@ -8,15 +8,9 @@ use crate::{
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream};
use hyper::{ext::Protocol, upgrade::Upgraded, Body, Method, Request, Response};
use hyper::upgrade::Upgraded;
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use tokio_tungstenite::WebSocketStream;
use tungstenite::{
error::{Error as WSError, ProtocolError},
handshake::derive_accept_key,
protocol::{Role, WebSocketConfig},
Message,
};
use std::{
pin::Pin,
@@ -156,202 +150,19 @@ pub async fn serve_websocket(
Ok(())
}
/// Try to upgrade a received `hyper::Request` to a websocket connection.
///
/// The function returns a HTTP response and a future that resolves to the websocket stream.
/// The response body *MUST* be sent to the client before the future can be resolved.
///
/// This function checks the `Sec-WebSocket-Key` and `Sec-WebSocket-Version` headers.
/// It does not inspect the `Origin`, `Sec-WebSocket-Protocol` or `Sec-WebSocket-Extensions` headers.
/// You can inspect the headers manually before calling this function,
/// and modify the response headers appropriately.
///
/// This function also does not look at the `Connection` or `Upgrade` headers.
/// To check if a request is a websocket upgrade request, you can use [`is_upgrade_request`].
/// Alternatively you can inspect the `Connection` and `Upgrade` headers manually.
///
pub fn upgrade<B>(
mut request: impl std::borrow::BorrowMut<Request<B>>,
config: Option<WebSocketConfig>,
) -> Result<(Response<Body>, HyperWebsocket), ProtocolError> {
let request = request.borrow_mut();
let key = request
.headers()
.get("Sec-WebSocket-Key")
.ok_or(ProtocolError::MissingSecWebSocketKey)?;
if request
.headers()
.get("Sec-WebSocket-Version")
.map(|v| v.as_bytes())
!= Some(b"13")
{
return Err(ProtocolError::MissingSecWebSocketVersionHeader);
}
let response = Response::builder()
.status(hyper::StatusCode::SWITCHING_PROTOCOLS)
.header(hyper::header::CONNECTION, "upgrade")
.header(hyper::header::UPGRADE, "websocket")
.header("Sec-WebSocket-Accept", &derive_accept_key(key.as_bytes()))
.body(Body::from("switching to websocket protocol"))
.expect("bug: failed to build response");
let stream = HyperWebsocket {
inner: hyper::upgrade::on(request),
config,
};
Ok((response, stream))
}
/// Check if a request is a websocket upgrade request.
///
/// If the `Upgrade` header lists multiple protocols,
/// this function returns true if any of them is `"websocket"`.
/// If the server supports multiple upgrade protocols,
/// it would be more appropriate to try each listed protocol in order.
pub fn is_upgrade_request<B>(request: &hyper::Request<B>) -> bool {
header_contains_value(request.headers(), hyper::header::CONNECTION, "Upgrade")
&& header_contains_value(request.headers(), hyper::header::UPGRADE, "websocket")
}
/// Check if there is a header of the given name containing the wanted value.
fn header_contains_value(
headers: &hyper::HeaderMap,
header: impl hyper::header::AsHeaderName,
value: impl AsRef<[u8]>,
) -> bool {
let value = value.as_ref();
for header in headers.get_all(header) {
if header
.as_bytes()
.split(|&c| c == b',')
.any(|x| trim(x).eq_ignore_ascii_case(value))
{
return true;
}
}
false
}
fn trim(data: &[u8]) -> &[u8] {
trim_end(trim_start(data))
}
fn trim_start(data: &[u8]) -> &[u8] {
if let Some(start) = data.iter().position(|x| !x.is_ascii_whitespace()) {
&data[start..]
} else {
b""
}
}
fn trim_end(data: &[u8]) -> &[u8] {
if let Some(last) = data.iter().rposition(|x| !x.is_ascii_whitespace()) {
&data[..last + 1]
} else {
b""
}
}
/// Try to upgrade a received `hyper::Request` to a websocket connection.
///
/// The function returns a HTTP response and a future that resolves to the websocket stream.
/// The response body *MUST* be sent to the client before the future can be resolved.
///
/// This function checks the `Sec-WebSocket-Version` header.
/// It does not inspect the `Origin`, `Sec-WebSocket-Protocol` or `Sec-WebSocket-Extensions` headers.
/// You can inspect the headers manually before calling this function,
/// and modify the response headers appropriately.
///
/// This function also does not look at the `Connection` or `Upgrade` headers.
/// To check if a request is a websocket connect request, you can use [`is_connect_request`].
/// Alternatively you can inspect the `Connection` and `Upgrade` headers manually.
///
pub fn connect<B>(
mut request: impl std::borrow::BorrowMut<Request<B>>,
config: Option<WebSocketConfig>,
) -> Result<(Response<Body>, HyperWebsocket), ProtocolError> {
let request = request.borrow_mut();
if request
.headers()
.get("Sec-WebSocket-Version")
.map(|v| v.as_bytes())
!= Some(b"13")
{
return Err(ProtocolError::MissingSecWebSocketVersionHeader);
}
let response = Response::builder()
.status(hyper::StatusCode::OK)
.body(Body::from("switching to websocket protocol"))
.expect("bug: failed to build response");
let stream = HyperWebsocket {
inner: hyper::upgrade::on(request),
config,
};
Ok((response, stream))
}
/// Check if a request is a websocket connect request.
pub fn is_connect_request<B>(request: &hyper::Request<B>) -> bool {
request.method() == Method::CONNECT
&& request
.extensions()
.get::<Protocol>()
.is_some_and(|protocol| protocol.as_str() == "websocket")
}
pin_project_lite::pin_project! {
/// A future that resolves to a websocket stream when the associated connection completes.
#[derive(Debug)]
pub struct HyperWebsocket {
#[pin]
inner: hyper::upgrade::OnUpgrade,
config: Option<WebSocketConfig>
}
}
impl std::future::Future for HyperWebsocket {
type Output = Result<WebSocketStream<hyper::upgrade::Upgraded>, WSError>;
fn poll(self: Pin<&mut Self>, cx: &mut std::task::Context) -> Poll<Self::Output> {
let this = self.project();
let upgraded = match this.inner.poll(cx) {
Poll::Pending => return Poll::Pending,
Poll::Ready(x) => x,
};
let upgraded =
upgraded.map_err(|_| WSError::Protocol(ProtocolError::HandshakeIncomplete))?;
let stream = WebSocketStream::from_raw_socket(upgraded, Role::Server, None);
tokio::pin!(stream);
// The future returned by `from_raw_socket` is always ready.
// Not sure why it is a future in the first place.
match stream.as_mut().poll(cx) {
Poll::Pending => unreachable!("from_raw_socket should always be created ready"),
Poll::Ready(x) => Poll::Ready(Ok(x)),
}
}
}
#[cfg(test)]
mod tests {
use std::pin::pin;
use futures::{SinkExt, StreamExt};
use hyper_tungstenite::{
tungstenite::{protocol::Role, Message},
WebSocketStream,
};
use tokio::{
io::{duplex, AsyncReadExt, AsyncWriteExt},
task::JoinSet,
};
use tokio_tungstenite::WebSocketStream;
use tungstenite::{protocol::Role, Message};
use super::WebSocketRw;

View File

@@ -235,18 +235,19 @@ async fn collect_metrics_iteration(
#[cfg(test)]
mod tests {
use std::{
net::TcpListener,
sync::{Arc, Mutex},
};
use std::sync::{Arc, Mutex};
use anyhow::Error;
use bytes::{Buf, Bytes};
use chrono::Utc;
use consumption_metrics::{Event, EventChunk};
use hyper::{
service::{make_service_fn, service_fn},
Body, Response,
use http_body_util::{BodyExt, Empty};
use hyper::{body::Incoming, service::service_fn, Response};
use hyper_util::{
rt::{TokioExecutor, TokioIo},
server::conn,
};
use tokio::net::TcpListener;
use url::Url;
use super::{collect_metrics_iteration, Ids, Metrics};
@@ -254,30 +255,43 @@ mod tests {
#[tokio::test]
async fn metrics() {
let listener = TcpListener::bind("0.0.0.0:0").unwrap();
let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let reports = Arc::new(Mutex::new(vec![]));
let reports2 = reports.clone();
let server = hyper::server::Server::from_tcp(listener)
.unwrap()
.serve(make_service_fn(move |_| {
let reports = reports.clone();
async move {
Ok::<_, Error>(service_fn(move |req| {
let reports = reports.clone();
async move {
let bytes = hyper::body::to_bytes(req.into_body()).await?;
let events: EventChunk<'static, Event<Ids, String>> =
serde_json::from_slice(&bytes)?;
reports.lock().unwrap().push(events);
Ok::<_, Error>(Response::new(Body::from(vec![])))
}
}))
}
}));
let addr = server.local_addr();
tokio::spawn(server);
let service = service_fn(move |req: hyper::Request<Incoming>| {
let reports = reports.clone();
async move {
let bytes = req
.into_body()
.collect()
.await
.unwrap()
.aggregate()
.reader();
let events: EventChunk<'static, Event<Ids, String>> =
serde_json::from_reader(bytes)?;
reports.lock().unwrap().push(events);
Ok::<_, Error>(Response::new(Empty::<Bytes>::new()))
}
});
tokio::spawn(async move {
loop {
let (stream, _) = listener.accept().await.unwrap();
let io = TokioIo::new(stream);
let service = service.clone();
tokio::task::spawn(async move {
let builder = conn::auto::Builder::new(TokioExecutor::new());
let res = builder.serve_connection(io, service).await;
if let Err(err) = res {
println!("Error serving connection: {:?}", err);
}
});
}
});
let metrics = Metrics::default();
let client = http::new_client(RateLimiterConfig::default());

View File

@@ -38,8 +38,6 @@ pytest-rerunfailures = "^13.0"
types-pytest-lazy-fixture = "^0.6.3.3"
pytest-split = "^0.8.1"
zstandard = "^0.21.0"
websockets = "^12.0"
httpx = {extras = ["http2"], version = "^0.26.0"}
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# This script sets up an ext4 filesystem on an EC2 storage-optimized instance's instance store volume.
# Unix permissions/ownership are set to the calling user (the script uses sudo internally).
#
# It's intentionally not idempotent; don't take on that complexity in a bash script.
set -euo pipefail
set -x
# This seems crude, but apparently instance store NVMe volumes aren't exposed in the instance metadata block-device-mapping.
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/block-device-mapping-concepts.html#bdm-instance-metadata
if [ "$(cat /sys/class/block/nvme1n1/device/model)" != "Amazon EC2 NVMe Instance Storage " ]; then
echo "nvme1n1 is not Amazon EC2 NVMe Instance Storage: '$(cat /sys/class/block/nvme1n1/device/model)'"
exit 1
fi
# NB: we DO NOT warm up all the blocks on the drive as recommended by https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/disk-performance.html
# The reason is that we don't do that in production either.
# Do all the on-disk initialization work now, instead of in a background kernel thread,
# so that we're ready for benchmarking right after this line.
sudo mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/nvme1n1
MOUNTPOINT=/instance_store
sudo mkdir "$MOUNTPOINT"
sudo mount /dev/nvme1n1 "$MOUNTPOINT"
sudo chown -R "$(id -u)":"$(id -g)" "$MOUNTPOINT"
TEST_OUTPUT="$MOUNTPOINT/test_output"
mkdir "$TEST_OUTPUT"
NEON_REPO_DIR="$MOUNTPOINT/repo_dir"
mkdir "$NEON_REPO_DIR"
cat <<EOF
SETUP COMPLETE
To run your local neon.git build on the instance store volume,
run the following commands from the top of the neon.git checkout
# test suite run
export TEST_OUTPUT="$TEST_OUTPUT"
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
# for interactive use
export NEON_REPO_DIR="$NEON_REPO_DIR"
cargo build_testing --release
./target/release/neon_local init --force empty-dir-ok
EOF

View File

@@ -18,7 +18,11 @@ futures-core.workspace = true
futures-util.workspace = true
git-version.workspace = true
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
hyper = { workspace = true, features = ["server"] }
hyper-util = { workspace = true, features = ["tokio", "server", "server-auto"] }
http = { workspace = true, features = [] }
http-body = { workspace = true, features = [] }
http-body-util = { workspace = true, features = [] }
once_cell.workspace = true
parking_lot.workspace = true
prost.workspace = true
@@ -29,6 +33,9 @@ tracing.workspace = true
metrics.workspace = true
utils.workspace = true
# needed for tonic
http0_2 = { package = "http", version = "0.2" }
workspace_hack.workspace = true
[build-dependencies]

View File

@@ -13,10 +13,14 @@
use clap::{command, Parser};
use futures_core::Stream;
use futures_util::StreamExt;
use http::Request;
use hyper::body::Incoming;
use hyper::header::CONTENT_TYPE;
use hyper::server::conn::AddrStream;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, StatusCode};
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::server::conn;
use parking_lot::RwLock;
use std::collections::HashMap;
use std::convert::Infallible;
@@ -24,6 +28,7 @@ use std::net::SocketAddr;
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpListener;
use tokio::sync::broadcast;
use tokio::sync::broadcast::error::RecvError;
use tokio::time;
@@ -596,9 +601,7 @@ impl BrokerService for Broker {
}
// We serve only metrics and healthcheck through http1.
async fn http1_handler(
req: hyper::Request<hyper::body::Body>,
) -> Result<hyper::Response<Body>, Infallible> {
async fn http1_handler(req: hyper::Request<Body>) -> Result<hyper::Response<Body>, Infallible> {
let resp = match (req.method(), req.uri().path()) {
(&Method::GET, "/metrics") => {
let mut buffer = vec![];
@@ -662,16 +665,19 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let storage_broker_server = BrokerServiceServer::new(storage_broker_impl);
info!("listening on {}", &args.listen_addr);
let listener = TcpListener::bind(args.listen_addr).await?;
// grpc is served along with http1 for metrics on a single port, hence we
// don't use tonic's Server.
hyper::Server::bind(&args.listen_addr)
.http2_keep_alive_interval(Some(args.http2_keepalive_interval))
.serve(make_service_fn(move |conn: &AddrStream| {
loop {
let (stream, remote_addr) = listener.accept().await?;
let io = TokioIo::new(stream);
tokio::task::spawn(async move {
let storage_broker_server_cloned = storage_broker_server.clone();
let connect_info = conn.connect_info();
async move {
Ok::<_, Infallible>(service_fn(move |mut req| {
let service = async move {
Ok::<_, Infallible>(service_fn(move |mut req: Request<Incoming>| {
// That's what tonic's MakeSvc.call does to pass conninfo to
// the request handler (and where its request.remote_addr()
// expects to find it).
@@ -690,6 +696,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
if req.headers().get("content-type").map(|x| x.as_bytes())
== Some(b"application/grpc")
{
// TODO: this doesn't work :(
let (parts, body) = req.into_parts();
let req = http0_2::Request::from_parts(parts, body);
let res_resp = storage_broker_server_svc.call(req).await;
// Grpc and http1 handlers have slightly different
// Response types: it is UnsyncBoxBody for the
@@ -703,10 +712,17 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
}
}))
};
let builder = conn::auto::Builder::new(TokioExecutor::new())
.http2()
.keep_alive_interval(Some(args.http2_keepalive_interval));
if let Err(err) = builder.serve_connection(io, service).await {
tracing::error!("Error serving connection: {:?}", err);
}
}))
.await?;
Ok(())
});
}
}
#[cfg(test)]

View File

@@ -1,8 +1,6 @@
use hyper::body::HttpBody;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;
use tonic::codegen::StdError;
pub use tonic::transport::Uri;
use tonic::transport::{ClientTlsConfig, Endpoint};
use tonic::{transport::Channel, Status};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -27,8 +25,6 @@ pub use tonic::Code;
pub use tonic::Request;
pub use tonic::Streaming;
pub use hyper::Uri;
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
@@ -99,50 +95,7 @@ pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTime
// well.
type AnyError = Box<dyn std::error::Error + Send + Sync + 'static>;
// Provides impl HttpBody for two different types implementing it. Inspired by
// https://github.com/hyperium/tonic/blob/master/examples/src/hyper_warp/server.rs
pub enum EitherBody<A, B> {
Left(A),
Right(B),
}
impl<A, B> HttpBody for EitherBody<A, B>
where
A: HttpBody + Send + Unpin,
B: HttpBody<Data = A::Data> + Send + Unpin,
A::Error: Into<AnyError>,
B::Error: Into<AnyError>,
{
type Data = A::Data;
type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
fn is_end_stream(&self) -> bool {
match self {
EitherBody::Left(b) => b.is_end_stream(),
EitherBody::Right(b) => b.is_end_stream(),
}
}
fn poll_data(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<Self::Data, Self::Error>>> {
match self.get_mut() {
EitherBody::Left(b) => Pin::new(b).poll_data(cx).map(map_option_err),
EitherBody::Right(b) => Pin::new(b).poll_data(cx).map(map_option_err),
}
}
fn poll_trailers(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Result<Option<hyper::HeaderMap>, Self::Error>> {
match self.get_mut() {
EitherBody::Left(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
EitherBody::Right(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
}
}
}
pub type EitherBody<L, R> = http_body_util::Either<L, R>;
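
The forty-odd lines of hand-written `HttpBody` glue above collapse into http-body-util's ready-made combinator. A tiny illustration of the alias (the body types here are arbitrary stand-ins):

```rust
use bytes::Bytes;
use http_body_util::{Either, Empty, Full};

fn main() {
    // Both arms must share a Data type (Bytes here); Either then
    // implements the Body trait by delegating to whichever arm is live.
    let _grpc: Either<Full<Bytes>, Empty<Bytes>> =
        Either::Left(Full::new(Bytes::from_static(b"grpc frame")));
    let _http1: Either<Full<Bytes>, Empty<Bytes>> = Either::Right(Empty::new());
}
```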
fn map_option_err<T, U: Into<AnyError>>(err: Option<Result<T, U>>) -> Option<Result<T, AnyError>> {
err.map(|e| e.map_err(Into::into))

View File

@@ -16,6 +16,7 @@ class Metrics:
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
filter = filter or {}
res = []
for sample in self.metrics[name]:
try:
if all(sample.labels[k] == v for k, v in filter.items()):

View File

@@ -19,12 +19,11 @@ from functools import cached_property
from itertools import chain, product
from pathlib import Path
from types import TracebackType
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
from urllib.parse import urlparse
import asyncpg
import backoff
import httpx
import jwt
import psycopg2
import pytest
@@ -62,7 +61,7 @@ from fixtures.remote_storage import (
default_remote_storage,
remote_storage_to_toml_inline_table,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
allure_add_grafana_links,
@@ -496,6 +495,8 @@ class NeonEnvBuilder:
self,
initial_tenant_conf: Optional[Dict[str, str]] = None,
default_remote_storage_if_missing: bool = True,
initial_tenant_shard_count: Optional[int] = None,
initial_tenant_shard_stripe_size: Optional[int] = None,
) -> NeonEnv:
"""
Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
@@ -513,7 +514,11 @@ class NeonEnvBuilder:
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
tenant_id=env.initial_tenant,
conf=initial_tenant_conf,
timeline_id=env.initial_timeline,
shard_count=initial_tenant_shard_count,
shard_stripe_size=initial_tenant_shard_stripe_size,
)
assert env.initial_tenant == initial_tenant
assert env.initial_timeline == initial_timeline
@@ -862,7 +867,9 @@ class NeonEnv:
attachment_service_port = self.port_distributor.get_port()
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service: NeonAttachmentService = NeonAttachmentService(self)
self.attachment_service: NeonAttachmentService = NeonAttachmentService(
self, config.auth_enabled
)
# Create a config file corresponding to the options
cfg: Dict[str, Any] = {
@@ -984,6 +991,16 @@ class NeonEnv:
raise RuntimeError(f"Pageserver with ID {id} not found")
def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
"""
Get the NeonPageserver where this tenant shard is currently attached, according
to the attachment service.
"""
meta = self.attachment_service.inspect(tenant_id)
assert meta is not None, f"{tenant_id} attachment location not found"
pageserver_id = meta[1]
return self.get_pageserver(pageserver_id)
def get_safekeeper_connstrs(self) -> str:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
@@ -1227,15 +1244,29 @@ class AbstractNeonCli(abc.ABC):
env_vars[var] = val
# Intercept CalledProcessError and print more info
res = subprocess.run(
args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout,
)
try:
res = subprocess.run(
args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout,
)
except subprocess.TimeoutExpired as e:
if e.stderr:
stderr = e.stderr.decode(errors="replace")
else:
stderr = ""
if e.stdout:
stdout = e.stdout.decode(errors="replace")
else:
stdout = ""
log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
raise
indent = " "
if not res.returncode:
@@ -1286,6 +1317,8 @@ class NeonCli(AbstractNeonCli):
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
conf: Optional[Dict[str, str]] = None,
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
set_default: bool = False,
) -> Tuple[TenantId, TimelineId]:
"""
@@ -1313,6 +1346,12 @@ class NeonCli(AbstractNeonCli):
if set_default:
args.append("--set-default")
if shard_count is not None:
args.extend(["--shard-count", str(shard_count)])
if shard_stripe_size is not None:
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
res = self.raw_cli(args)
res.check_returncode()
return tenant_id, timeline_id
@@ -1637,6 +1676,19 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(args, check_return_code=True)
def tenant_migrate(
self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
):
args = [
"tenant",
"migrate",
"--tenant-id",
str(tenant_shard_id),
"--id",
str(new_pageserver),
]
return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
return self.raw_cli(["start"], check_return_code=check_return_code)
@@ -1685,9 +1737,10 @@ class Pagectl(AbstractNeonCli):
class NeonAttachmentService:
def __init__(self, env: NeonEnv):
def __init__(self, env: NeonEnv, auth_enabled: bool):
self.env = env
self.running = False
self.auth_enabled = auth_enabled
def start(self):
assert not self.running
@@ -1701,27 +1754,50 @@ class NeonAttachmentService:
self.running = False
return self
def attach_hook_issue(self, tenant_id: TenantId, pageserver_id: int) -> int:
response = requests.post(
def request(self, method, *args, **kwargs) -> requests.Response:
kwargs["headers"] = self.headers()
return requests.request(method, *args, **kwargs)
def headers(self) -> Dict[str, str]:
headers = {}
if self.auth_enabled:
jwt_token = self.env.auth_keys.generate_pageserver_token()
headers["Authorization"] = f"Bearer {jwt_token}"
return headers
def attach_hook_issue(
self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
) -> int:
response = self.request(
"POST",
f"{self.env.control_plane_api}/attach-hook",
json={"tenant_id": str(tenant_id), "node_id": pageserver_id},
json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
headers=self.headers(),
)
response.raise_for_status()
gen = response.json()["gen"]
assert isinstance(gen, int)
return gen
def attach_hook_drop(self, tenant_id: TenantId):
response = requests.post(
def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
response = self.request(
"POST",
f"{self.env.control_plane_api}/attach-hook",
json={"tenant_id": str(tenant_id), "node_id": None},
json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
headers=self.headers(),
)
response.raise_for_status()
def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]:
response = requests.post(
def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]:
"""
:return: 2-tuple of (generation, pageserver id), or None if unknown
"""
response = self.request(
"POST",
f"{self.env.control_plane_api}/inspect",
json={"tenant_id": str(tenant_id)},
json={"tenant_shard_id": str(tenant_shard_id)},
headers=self.headers(),
)
response.raise_for_status()
json = response.json()
@@ -1732,6 +1808,79 @@ class NeonAttachmentService:
else:
return None
def node_register(self, node: NeonPageserver):
body = {
"node_id": int(node.id),
"listen_http_addr": "localhost",
"listen_http_port": node.service_port.http,
}
log.info(f"node_register({body})")
self.request(
"POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers()
).raise_for_status()
def tenant_create(
self,
tenant_id: TenantId,
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
tenant_config: Optional[Dict[Any, Any]] = None,
):
body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
if shard_count is not None:
shard_params = {"count": shard_count}
if shard_stripe_size is not None:
shard_params["stripe_size"] = shard_stripe_size
body["shard_parameters"] = shard_params
if tenant_config is not None:
for k, v in tenant_config.items():
body[k] = v
response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body)
response.raise_for_status()
log.info(f"tenant_create success: {response.json()}")
def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
response = self.request(
"POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
)
response.raise_for_status()
log.info(f"tenant_timeline_create success: {response.json()}")
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
response.raise_for_status()
body = response.json()
shards: list[dict[str, Any]] = body["shards"]
return shards
def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
response = self.request(
"PUT",
f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
json={"new_shard_count": shard_count},
)
response.raise_for_status()
body = response.json()
log.info(f"tenant_shard_split success: {body}")
shards: list[TenantShardId] = body["new_shards"]
return shards
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
response = self.request(
"PUT",
f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
)
response.raise_for_status()
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -2478,33 +2627,6 @@ class NeonProxy(PgProtocol):
assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
return response.json()
async def http2_query(self, query, args, **kwargs):
# TODO maybe use default values if not provided
user = kwargs["user"]
password = kwargs["password"]
expected_code = kwargs.get("expected_code")
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
async with httpx.AsyncClient(
http2=True, verify=str(self.test_output_dir / "proxy.crt")
) as client:
response = await client.post(
f"https://{self.domain}:{self.external_http_port}/sql",
json={"query": query, "params": args},
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Pool-Opt-In": "true",
},
)
assert response.http_version == "HTTP/2"
if expected_code is not None:
assert (
response.status_code == kwargs["expected_code"]
), f"response: {response.json()}"
return response.json()
def get_metrics(self) -> str:
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
request_result.raise_for_status()
@@ -2859,7 +2981,7 @@ class Endpoint(PgProtocol):
hot_standby=hot_standby,
lsn=lsn,
pageserver_id=pageserver_id,
).start(remote_ext_config=remote_ext_config)
).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id)
log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -3372,7 +3494,7 @@ def pytest_addoption(parser: Parser):
SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
)
@@ -3509,9 +3631,7 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(
test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, pageserver_id: Optional[int] = None
):
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
# Get the timeline ID. We need it for the 'basebackup' command
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
@@ -3532,6 +3652,7 @@ def check_restored_datadir_content(
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
cmd = rf"""
{psql_path} \
--no-psqlrc \
@@ -3600,6 +3721,38 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -
time.sleep(0.5)
def tenant_get_shards(
env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int]
) -> list[tuple[TenantShardId, NeonPageserver]]:
"""
Helper for when you want to talk to one or more pageservers, and the
caller _might_ have specified a pageserver, or they might leave it to
us to figure out the shards for a tenant.
If the caller provides `pageserver_id`, it will be used for all shards, even
if the attachment service indicates that a shard is on some other pageserver.
The caller should iterate over the response to apply their per-pageserver
action to each shard.
"""
if pageserver_id is not None:
override_pageserver = [p for p in env.pageservers if p.id == pageserver_id][0]
else:
override_pageserver = None
if len(env.pageservers) > 1:
return [
(
TenantShardId.parse(s["shard_id"]),
override_pageserver or env.get_pageserver(s["node_id"]),
)
for s in env.attachment_service.locate(tenant_id)
]
else:
# Assume an unsharded tenant
return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)]
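As a usage sketch (the tenant/timeline variable names are assumed to come from the surrounding fixtures), a caller applies a per-shard action like this:

# Apply an action to every shard of a tenant, on whichever pageserver holds
# it; pass pageserver_id=None to let the attachment service decide.
for tenant_shard_id, pageserver in tenant_get_shards(env, tenant_id, None):
    detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id)
    # "state" field assumed from the timeline detail response.
    log.info(f"shard {tenant_shard_id} on pageserver {pageserver.id}: {detail.get('state')}")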
def wait_for_last_flush_lsn(
env: NeonEnv,
endpoint: Endpoint,
@@ -3609,10 +3762,24 @@ def wait_for_last_flush_lsn(
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
shards = tenant_get_shards(env, tenant, pageserver_id)
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
return wait_for_last_record_lsn(
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
)
results = []
for tenant_shard_id, pageserver in shards:
log.info(
f"wait_for_last_flush_lsn: waiting for {last_flush_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})"
)
waited = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
assert waited >= last_flush_lsn
results.append(waited)
# Return the lowest LSN that has been ingested by all shards
return min(results)
def wait_for_wal_insert_lsn(
@@ -3624,9 +3791,16 @@ def wait_for_wal_insert_lsn(
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
return wait_for_last_record_lsn(
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
)
result = None
for tenant_shard_id, pageserver in tenant_get_shards(env, tenant, pageserver_id):
shard_r = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
# Track the lowest LSN that has been ingested by all shards
result = shard_r if result is None else min(result, shard_r)
assert result is not None
return result
def fork_at_current_lsn(
@@ -3660,11 +3834,13 @@ def last_flush_lsn_upload(
last_flush_lsn = wait_for_last_flush_lsn(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id
)
ps_http = env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
shards = tenant_get_shards(env, tenant_id, pageserver_id)
for tenant_shard_id, pageserver in shards:
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, timeline_id)
wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
return last_flush_lsn
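A typical call site, sketched under the assumption of a test that has just written data through `endpoint`:

# Flush WAL through to remote storage on every shard, e.g. before restarting
# a pageserver, so the data survives loss of local disk state.
lsn = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
log.info(f"all shards uploaded up to {lsn}")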


@@ -4,7 +4,7 @@ import json
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import requests
from requests.adapters import HTTPAdapter
@@ -13,7 +13,7 @@ from urllib3.util.retry import Retry
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import Fn
@@ -211,7 +211,7 @@ class PageserverHttpClient(requests.Session):
def tenant_create(
self,
new_tenant_id: TenantId,
new_tenant_id: Union[TenantId, TenantShardId],
conf: Optional[Dict[str, Any]] = None,
generation: Optional[int] = None,
) -> TenantId:
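These widened signatures accept either a bare `TenantId` or a `TenantShardId`; a minimal sketch of addressing one shard directly, assuming the `TenantShardId(tenant_id, shard_number, shard_count)` construction used elsewhere in this diff:

# Address shard 0 of a 2-shard tenant explicitly, instead of the whole tenant.
shard_zero = TenantShardId(tenant_id, 0, 2)
status = pageserver_http.tenant_status(shard_zero)
# "state" field assumed from the tenant status response.
log.info(f"shard state: {status.get('state')}")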
@@ -239,7 +239,7 @@ class PageserverHttpClient(requests.Session):
def tenant_attach(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
config: None | Dict[str, Any] = None,
config_null: bool = False,
generation: Optional[int] = None,
@@ -269,7 +269,7 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
self.verbose_error(res)
def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
params = {}
if drop_cache:
params["drop_cache"] = "true"
@@ -278,7 +278,7 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
def tenant_location_conf(
self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None
self, tenant_id: Union[TenantId, TenantShardId], location_conf: dict[str, Any], flush_ms=None
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
@@ -294,7 +294,7 @@ class PageserverHttpClient(requests.Session):
)
self.verbose_error(res)
def tenant_delete(self, tenant_id: TenantId):
def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
return res
@@ -310,27 +310,27 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
self.verbose_error(res)
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def tenant_heatmap_upload(self, tenant_id: TenantId):
def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
self.verbose_error(res)
def tenant_secondary_download(self, tenant_id: TenantId):
def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
self.verbose_error(res)
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(
f"http://localhost:{self.port}/v1/tenant/config",
@@ -352,10 +352,12 @@ class PageserverHttpClient(requests.Session):
del current[key]
self.set_tenant_config(tenant_id, current)
def tenant_size(self, tenant_id: TenantId) -> int:
def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int:
return self.tenant_size_and_modelinputs(tenant_id)[0]
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
def tenant_size_and_modelinputs(
self, tenant_id: Union[TenantId, TenantShardId]
) -> Tuple[int, Dict[str, Any]]:
"""
Returns the tenant size, together with the model inputs as the second tuple item.
"""
@@ -370,7 +372,7 @@ class PageserverHttpClient(requests.Session):
assert isinstance(inputs, dict)
return (size, inputs)
def tenant_size_debug(self, tenant_id: TenantId) -> str:
def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str:
"""
Returns the tenant size debug info, as an HTML string
"""
@@ -382,7 +384,7 @@ class PageserverHttpClient(requests.Session):
def timeline_list(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
) -> List[Dict[str, Any]]:
@@ -403,7 +405,7 @@ class PageserverHttpClient(requests.Session):
def timeline_create(
self,
pg_version: PgVersion,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
new_timeline_id: TimelineId,
ancestor_timeline_id: Optional[TimelineId] = None,
ancestor_start_lsn: Optional[Lsn] = None,
@@ -437,7 +439,7 @@ class PageserverHttpClient(requests.Session):
def timeline_detail(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
@@ -462,7 +464,9 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
def timeline_delete(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs
):
"""
Note that deletion is not instant, it is scheduled and performed mostly in the background.
So if you need to wait for it to complete use `timeline_delete_wait_completed`.
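A sketch of the wait-for-completion pattern the docstring refers to, using the `timeline_delete_wait_completed` helper that appears further down in this diff:

# Deletion is scheduled in the background; poll until the timeline
# detail endpoint reports 404.
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)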
@@ -476,7 +480,10 @@ class PageserverHttpClient(requests.Session):
assert res_json is None
def timeline_gc(
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
gc_horizon: Optional[int],
) -> dict[str, Any]:
"""
Unlike most handlers, this will wait for the layers to be actually
@@ -499,7 +506,10 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_compact(
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
force_repartition=False,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -518,7 +528,7 @@ class PageserverHttpClient(requests.Session):
def timeline_get_lsn_by_timestamp(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
timestamp,
version: Optional[int] = None,
@@ -537,7 +547,9 @@ class PageserverHttpClient(requests.Session):
res_json = res.json()
return res_json
def timeline_get_timestamp_of_lsn(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
def timeline_get_timestamp_of_lsn(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn
):
log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}")
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn?lsn={lsn}",
@@ -547,7 +559,10 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_checkpoint(
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
force_repartition=False,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -566,7 +581,7 @@ class PageserverHttpClient(requests.Session):
def timeline_spawn_download_remote_layers(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
max_concurrent_downloads: int,
) -> dict[str, Any]:
@@ -585,7 +600,7 @@ class PageserverHttpClient(requests.Session):
def timeline_poll_download_remote_layers_status(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
spawn_response: dict[str, Any],
poll_state=None,
@@ -607,7 +622,7 @@ class PageserverHttpClient(requests.Session):
def timeline_download_remote_layers(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
max_concurrent_downloads: int,
errors_ok=False,
@@ -689,9 +704,37 @@ class PageserverHttpClient(requests.Session):
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Raises RuntimeError if not all of `names` are found: this method is
intended for loading sets of metrics whose existence is coupled.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
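A sketch of fetching a coupled set of values in one scrape (the metric names here are assumptions for illustration, not taken from this diff):

# One /metrics scrape, many values: cheaper than repeated get_metric_value calls.
values = pageserver_http.get_metrics_values(
    [
        "pageserver_tenant_states_count",
        "pageserver_remote_physical_size",
    ],
    filter={"tenant_id": str(tenant_id)},
)
log.info(f"metrics: {values}")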
def layer_map_info(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
) -> LayerMapInfo:
res = self.get(
@@ -700,7 +743,9 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return LayerMapInfo.from_json(res.json())
def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
def download_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
):
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
@@ -708,14 +753,18 @@ class PageserverHttpClient(requests.Session):
assert res.status_code == 200
def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
def download_all_layers(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
):
info = self.layer_map_info(tenant_id, timeline_id)
for layer in info.historic_layers:
if not layer.remote:
continue
self.download_layer(tenant_id, timeline_id, layer.layer_file_name)
def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
def evict_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
):
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
@@ -723,7 +772,7 @@ class PageserverHttpClient(requests.Session):
assert res.status_code in (200, 304)
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
info = self.layer_map_info(tenant_id, timeline_id)
for layer in info.historic_layers:
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
@@ -736,7 +785,7 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return res.json()
def tenant_break(self, tenant_id: TenantId):
def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
self.verbose_error(res)


@@ -1,12 +1,12 @@
import time
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import wait_until
@@ -22,7 +22,9 @@ def assert_tenant_state(
def remote_consistent_lsn(
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
pageserver_http: PageserverHttpClient,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
) -> Lsn:
detail = pageserver_http.timeline_detail(tenant, timeline)
@@ -39,7 +41,7 @@ def remote_consistent_lsn(
def wait_for_upload(
pageserver_http: PageserverHttpClient,
tenant: TenantId,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
lsn: Lsn,
):
@@ -92,7 +94,7 @@ def wait_until_tenant_state(
def wait_until_timeline_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
expected_state: str,
iterations: int,
@@ -141,7 +143,9 @@ def wait_until_tenant_active(
def last_record_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
pageserver_http_client: PageserverHttpClient,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
) -> Lsn:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
@@ -152,7 +156,7 @@ def last_record_lsn(
def wait_for_last_record_lsn(
pageserver_http: PageserverHttpClient,
tenant: TenantId,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
lsn: Lsn,
) -> Lsn:
@@ -194,7 +198,7 @@ def wait_for_upload_queue_empty(
def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
iterations: int,
interval: Optional[float] = None,
@@ -219,7 +223,7 @@ def wait_timeline_detail_404(
def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
iterations: int = 20,
interval: Optional[float] = None,


@@ -5,6 +5,7 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
last_flush_lsn_upload,
tenant_get_shards,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
@@ -31,7 +32,7 @@ class Workload:
self._endpoint: Optional[Endpoint] = None
def endpoint(self, pageserver_id: int) -> Endpoint:
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
"main",
@@ -54,7 +55,7 @@ class Workload:
if self._endpoint is not None:
self._endpoint.stop()
def init(self, pageserver_id: int):
def init(self, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
@@ -63,7 +64,7 @@ class Workload:
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def write_rows(self, n, pageserver_id):
def write_rows(self, n, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
@@ -81,7 +82,7 @@ class Workload:
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def churn_rows(self, n, pageserver_id, upload=True):
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
assert self.expect_rows >= n
max_iters = 10
@@ -119,21 +120,24 @@ class Workload:
]
)
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = self.env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
def validate(self, pageserver_id):
def validate(self, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
result = endpoint.safe_psql_many(
[


@@ -61,7 +61,7 @@ def measure_recovery_time(env: NeonCompare):
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
# we will explicitly create the tenant in the same generation that it was previously
# attached in.
attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant)
assert attach_status is not None
(attach_gen, _) = attach_status


@@ -10,6 +10,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pg_version import PgVersion
from fixtures.types import TenantId, TimelineId
@@ -126,7 +127,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
_ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)
_ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
# Restart the page server
env.pageserver.restart(immediate=True)
@@ -160,7 +161,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
]
)
env.pageserver.tenant_create(env.initial_tenant)
env.neon_cli.create_tenant(env.initial_tenant)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
@@ -216,7 +217,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
# Introduce failpoint when creating a new timeline uninit mark, before any other files were created
pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
_ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)
_ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
# "New" timeline is not present in the list, allowing pageserver to retry the same request

Some files were not shown because too many files have changed in this diff.