Compare commits

...

60 Commits

Author SHA1 Message Date
Christian Schwarz
cabf452fa7 avoid traversing the anyhow Cause chain
Observation: there was only a thin layer of anyhow between the
types for which log_compaction_error chased down the cause chain
and the conversion to CompactionError::Other.

So, remove the implicit #[from] conversion generated by thiserror,
and de-`anyhow`ify / explicitly opt-into-`::Other` all the places
that used it previously.
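
For illustration, a minimal sketch of the pattern (type, variant, and function names here are made up, not the actual pageserver types):

```rust
use thiserror::Error;

#[derive(Error, Debug)]
enum CompactionError {
    // Before: `Other(#[from] anyhow::Error)` let any anyhow error convert
    // implicitly via `?`, hiding where the cause chain gets flattened.
    #[error(transparent)]
    Other(anyhow::Error),
}

fn fallible() -> Result<(), std::io::Error> {
    Ok(())
}

fn compact() -> Result<(), CompactionError> {
    // After: callers opt into `::Other` explicitly, so it is visible at the
    // call site that the error is being wrapped as "other".
    fallible().map_err(|e| CompactionError::Other(anyhow::Error::new(e)))?;
    Ok(())
}
```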
2024-01-23 20:08:31 +01:00
Joonas Koivunen
7c9f4c270e driveby: align error stringifications 2024-01-23 09:54:42 +00:00
Joonas Koivunen
2404106586 fix: log some errors at info, error or disregard 2024-01-23 09:54:42 +00:00
Joonas Koivunen
b45c1b5965 feat: add similar is_stopping to PageReconstructError 2024-01-23 09:54:42 +00:00
Joonas Koivunen
82e97e0c59 feat: add new root cause for RTC stopping 2024-01-23 09:54:42 +00:00
Konstantin Knizhnik
00d9bf5b61 Implement lockless update of pageserver_connstring GUC in shared memory (#6314)
## Problem

There is a "neon.pageserver_connstring" GUC with the PGC_SIGHUP option,
allowing it to be changed using pg_reload_conf(). It is used by the control
plane to update the pageserver connection string if a pageserver has crashed,
been relocated, or new shards have been added.
It is copied to shared memory because the config cannot be loaded during
query execution, and we need to re-establish the connection to the pageserver.

## Summary of changes

Copying the connection string to shared memory is done by the postmaster, and
other backends should check an update counter to determine whether the
connection URL has changed and the connection needs to be re-established.
We cannot use standard Postgres LWLocks, because the postmaster has no proc
entry and so cannot wait on this primitive. This is why a lockless access
algorithm is implemented, using two atomic counters to enforce consistent
reading of the connection string value from shared memory.
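
For illustration, a minimal sketch of that two-counter protocol (the real implementation is C operating on Postgres shared memory; here the connection string is stood in for by a single atomic word so the example stays in safe Rust):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Two-counter ("seqlock"-style) protocol: the writer brackets its update with
/// two counter increments; readers retry until both counters agree, which
/// proves no write overlapped their read.
struct SharedConnstring {
    started: AtomicU64,  // bumped by the writer before it updates the value
    finished: AtomicU64, // bumped by the writer after the update is complete
    payload: AtomicU64,  // stand-in for the copied connection string bytes
}

impl SharedConnstring {
    /// Writer side (the postmaster, after pg_reload_conf()).
    fn write(&self, new_value: u64) {
        self.started.fetch_add(1, Ordering::SeqCst);
        self.payload.store(new_value, Ordering::SeqCst);
        self.finished.fetch_add(1, Ordering::SeqCst);
    }

    /// Reader side (backends): lock-free, so no LWLock waiting is needed.
    fn read(&self) -> u64 {
        loop {
            let finished = self.finished.load(Ordering::SeqCst);
            let value = self.payload.load(Ordering::SeqCst);
            let started = self.started.load(Ordering::SeqCst);
            if started == finished {
                return value;
            }
            std::hint::spin_loop();
        }
    }
}
```

A backend can additionally cache the last counter value it saw, so a single atomic load tells it whether the connection string changed and the pageserver connection needs to be re-established.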



---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-01-23 07:55:05 +02:00
Sasha Krassovsky
71f495c7f7 Gate it behind feature flags 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
0a7e050144 Fix test one last time 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
55bfa91bd7 Fix test again again 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
d90b2b99df Fix test again 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
27587e155d Fix test 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
55aede2762 Prevent duplicate insertions 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
9f186b4d3e Fix query 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
585687d563 Fix syntax error 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
65a98e425d Switch to bigint 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
b2e7249979 Sleep 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
844303255a Cargo fmt 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
6d8df2579b Fix dumb thing 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
3c3b53f8ad Update test 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
30064eb197 Add scary comment 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
869acfe29b Make migrations transactional 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
11a91eaf7b Uncomment the thread 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
394ef013d0 Push the migrations test 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
a718287902 Make migrations happen on a separate thread 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
2eac1adcb9 Make clippy happy 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
3f90b2d337 Fix test_ddl_forwarding 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
a40ed86d87 Add test for migrations, add initial migration 2024-01-22 14:53:29 -08:00
Sasha Krassovsky
1bf8bb88c5 Add support for migrations within compute_ctl 2024-01-22 14:53:29 -08:00
Vlad Lazar
f1901833a6 pageserver_api: migrate keyspace related functions from pgdatadir_mapping (#6406)
The idea is to achieve separation between keyspace layout definition
and operating on said keyspace. I've inlined all these functions since
they're small and we don't use LTO in the storage release builds
at the moment.

Closes https://github.com/neondatabase/neon/issues/6347
2024-01-22 19:16:38 +00:00
Arthur Petukhovsky
b41ee81308 Log warning on slow WAL removal (#6432)
Also add the `safekeeper_active_timelines` metric.
Should help with investigating #6403
2024-01-22 18:38:05 +00:00
Christian Schwarz
205b6111e6 attachment_service: /attach-hook: correctly handle detach (#6433)
Before this patch, we would update the `tenant_state.intent` in memory
but not persist the detachment to disk.

I noticed this in https://github.com/neondatabase/neon/pull/6214 where
we stop, then restart, the attachment service.
2024-01-22 18:27:05 +00:00
John Spray
93572a3e99 pageserver: mark tenant broken when cancelling attach (#6430)
## Problem

When a tenant is in Attaching state, and waiting for the
`concurrent_tenant_warmup` semaphore, it also listens for the tenant
cancellation token. When that token fires, Tenant::attach drops out.
Meanwhile, Tenant::set_stopping waits forever for the tenant to exit
Attaching state.

Fixes: https://github.com/neondatabase/neon/issues/6423

## Summary of changes

- In the absence of a valid state for the tenant, it is set to Broken in
this path. A more elegant solution will require more refactoring, beyond
this minimal fix.
2024-01-22 15:50:32 +00:00
Christian Schwarz
15c0df4de7 fixup(#6037): actually fix the issue, #6388 failed to do so (#6429)
Before this patch, the select! still returned immediately if `futs` was
empty. I must have tested a stale build in my manual testing of #6388.
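
The underlying pitfall is generic to `FuturesUnordered`: polling `next()` on an empty set resolves immediately with `None`, so a `select!` over it falls through right away. A standalone illustration (not the page_service code itself):

```rust
use futures::{stream::FuturesUnordered, StreamExt};
use std::{future::Future, pin::Pin, time::Duration};

#[tokio::main]
async fn main() {
    let mut futs: FuturesUnordered<Pin<Box<dyn Future<Output = ()>>>> =
        FuturesUnordered::new();

    tokio::select! {
        // With `futs` empty, this branch completes immediately with `None`.
        res = futs.next() => println!("futs branch fired immediately: {res:?}"),
        _ = tokio::time::sleep(Duration::from_secs(1)) => println!("timer fired"),
    }

    // A common guard is to only poll the set once it is non-empty:
    // if !futs.is_empty() { tokio::select! { ... } }
}
```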
2024-01-22 14:27:29 +00:00
Anna Khanova
3290fb09bf Proxy: fix gc (#6426)
## Problem

GC currently doesn't work properly.

## Summary of changes

Change the statement used when running GC.
2024-01-22 13:24:10 +00:00
hamishc
efdb2bf948 Added missing PG_VERSION arg into compute node dockerfile (#6382)
## Problem

If you build the compute-node dockerfile with the PG_VERSION argument
passed in (e.g. `docker build -f Dockerfile.compute-node --build-arg
PG_VERSION=v15 .`), it fails, as some of the stages don't have the
PG_VERSION arg defined.

## Summary of changes

Added the PG_VERSION arg to the plv8-build, neon-pg-ext-build, and 
pg-embedding-pg-build stages of Dockerfile.compute-node
2024-01-22 11:05:27 +00:00
Conrad Ludgate
5559b16953 bump shlex (#6421)
## Problem

https://rustsec.org/advisories/RUSTSEC-2024-0006

## Summary of changes

`cargo update -p shlex`
2024-01-22 09:14:30 +00:00
Konstantin Knizhnik
1aea65eb9d Fix potential overflow in update_next_xid (#6412)
## Problem

See https://neondb.slack.com/archives/C06F5UJH601/p1705731304237889

Adding 1 to the xid in `update_next_xid` can cause an overflow in debug mode;
0xffffffff is a valid transaction ID.

## Summary of changes

Use `wrapping_add` 
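
A standalone illustration of the overflow behaviour (not the pageserver's XID-advancement logic itself):

```rust
fn main() {
    let xid: u32 = 0xffff_ffff; // a valid transaction ID
    // In debug builds `xid + 1` panics with "attempt to add with overflow";
    // wrapping_add makes the modulo-2^32 arithmetic explicit instead.
    let next = xid.wrapping_add(1);
    assert_eq!(next, 0);
    println!("{xid:#x} + 1 wraps to {next}");
}
```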


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-01-21 22:11:00 +02:00
Conrad Ludgate
34ddec67d9 proxy small tweaks (#6398)
## Problem

In https://github.com/neondatabase/neon/pull/6283 I did a couple of changes
that weren't directly related to the goal of extracting the state
machine, so I'm putting them here.

## Summary of changes

- move postgres vs console provider into another enum
- reduce error cases for link auth
- slightly refactor link flow
2024-01-21 09:58:42 +01:00
Anna Khanova
9ace36d93c Proxy: do not store empty key (#6415)
## Problem

Currently we store an entry in the cache even if the project is undefined. That
makes invalidation impossible.

## Summary of changes

Do not store if project id is empty.
2024-01-20 16:14:53 +00:00
Heikki Linnakangas
e4898a6e60 Don't pass InvalidTransactionId to update_next_xid. (#6410)
update_next_xid() doesn't have any special treatment for the invalid or
other special XIDs, so it will treat InvalidTransactionId (0) as a
regular XID. If the old nextXid is smaller than 2^31, 0 will look like a
very old XID, and nothing happens. But if nextXid is greater than 2^31, 0
will look like a very new XID, and update_next_xid() will incorrectly
bump up nextXid.
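
A sketch of why the 2^31 boundary matters, using the usual circular XID comparison (the signed 32-bit difference, as in Postgres' TransactionIdFollows); this is an illustration, not the pageserver code:

```rust
/// "a is newer than b" under modulo-2^32 XID arithmetic.
fn xid_follows(a: u32, b: u32) -> bool {
    (a.wrapping_sub(b) as i32) > 0
}

fn main() {
    // nextXid below 2^31: XID 0 compares as older, so nothing happens.
    assert!(!xid_follows(0, 0x1000_0000));
    // nextXid above 2^31: XID 0 compares as newer, so passing
    // InvalidTransactionId (0) in would incorrectly bump nextXid.
    assert!(xid_follows(0, 0x9000_0000));
}
```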
2024-01-20 18:04:16 +02:00
Joonas Koivunen
c77981289c build: terminate long running tests (#6389)
Configures nextest to kill tests after 1 minute. The slow period is set to
20s, which is how long our tests currently take in total; there will be 2
warnings and then the test will be killed and its output logged.

Cc: #6361
Cc: #6368 -- likely this will be enough for a long while, but it will be
counterproductive when we want to attach and debug; the added line
would have to be commented out.
2024-01-20 17:41:55 +02:00
Anna Khanova
f003dd6ad5 Remove rename in parameters (#6411)
## Problem

The name in notifications is not compatible with the console name.

## Summary of changes

Rename the fields to make them compatible.
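
One way such a rename can be done without touching Rust call sites is serde's rename attribute; a sketch with made-up field names (the PR may equally well have renamed the fields directly):

```rust
use serde::Deserialize;

#[derive(Deserialize)]
struct Notification {
    // Accept the console's field name on the wire while keeping the
    // internal name unchanged.
    #[serde(rename = "project")]
    project_id: String,
}
```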
2024-01-20 10:20:53 +00:00
Conrad Ludgate
7e7e9f5191 proxy: add more columns to parquet upload (#6405)
## Problem

Some fields were missed in the initial spec.

## Summary of changes

Adds a success boolean (defaults to false unless specifically marked as
successful).
Adds a duration_us integer that tracks how many microseconds were taken
from session start through to request completion.
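
A minimal sketch of that duration_us bookkeeping (names are illustrative, not the proxy's actual types):

```rust
use std::time::Instant;

struct SessionMetrics {
    started_at: Instant, // captured at session start
}

impl SessionMetrics {
    fn duration_us(&self) -> u64 {
        // Microseconds elapsed from session start through request completion.
        self.started_at.elapsed().as_micros() as u64
    }
}

fn main() {
    let metrics = SessionMetrics { started_at: Instant::now() };
    println!("duration_us = {}", metrics.duration_us());
}
```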
2024-01-20 09:38:11 +00:00
Christian Schwarz
760a48207d fixup(#6037): page_service hangs up within 10ms if there's no message (#6388)
From #6037 on, until this patch, if the client opens the connection but
doesn't send a `PagestreamFeMessage` within the first 10ms, we'd close
the connection because `self.timeline_cancelled()` returns.
It returns because `self.shard_timelines` is still empty at that point:
it gets filled lazily within the handlers for the incoming messages.

Changes
-------

The question is: if we can't check for timeline cancellation, what else
do we need to be cancellable for? `tenant.cancel` is also a bad choice
because the `tenant` (shard) we pick at the top of handle_pagerequests
might indeed go away over the course of the connection lifetime, but
other shards may still be there.

The correct solution, I think, is to be responsive to task_mgr
cancellation, because the connection handler runs in a task_mgr task and
it is already the canonical way we shut down a tenant's /
timeline's page_service connections (see `Tenant::shutdown` /
`Timeline::shutdown`).

So, rename the function and make it sensitive to task_mgr cancellation.
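
A generic sketch of that shape, staying responsive to a cancellation token while waiting for the next client message (names are illustrative; the real handler is wired into pageserver's task_mgr):

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

/// Wait for the next client message, but bail out promptly when the
/// task-level token fires (e.g. tenant/timeline shutdown).
async fn read_next_message(cancel: &CancellationToken) -> Option<&'static str> {
    tokio::select! {
        _ = cancel.cancelled() => None, // shutting down: close the connection
        msg = fake_client_message() => Some(msg),
    }
}

// Stand-in for reading a PagestreamFeMessage off the socket.
async fn fake_client_message() -> &'static str {
    tokio::time::sleep(Duration::from_millis(50)).await;
    "get_page_at_lsn"
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    assert_eq!(read_next_message(&cancel).await, Some("get_page_at_lsn"));
    cancel.cancel();
    assert_eq!(read_next_message(&cancel).await, None);
}
```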
2024-01-19 19:16:01 +00:00
Arseny Sher
88df057531 Delete WAL segments from s3 when timeline is deleted.
In the most straightforward way: the safekeeper performs it in the DELETE
endpoint implementation, with no coordination between safekeepers.

The delete_force endpoint in the code is renamed to delete, as there is now
only one way to delete.
2024-01-19 20:11:24 +04:00
Alexander Bayandin
c65ac37a6d zenbenchmark: attach perf results to allure report (#6395)
## Problem

For PRs with the `run-benchmarks` label, we don't upload results to the DB,
making it harder to debug such tests. The only way to see the
numbers is by examining the GitHub Actions output, which is really
inconvenient.
This PR adds zenbenchmark metrics to Allure reports.

## Summary of changes
- Create a json file with zenbenchmark results and attach it to allure
report
2024-01-18 20:59:43 +00:00
Arthur Petukhovsky
a092127b17 Fix truncateLsn initialization (#6396)
In
7f828890cf
we changed the logic for persisting control_files. Previously the control file
was updated if `peer_horizon_lsn` jumped more than one segment, which meant
`peer_horizon_lsn` was initialized on disk as soon as the safekeeper had
received its first `AppendRequest`.

This caused an issue with `truncateLsn`, which can now sometimes be
zero. This PR fixes it, and now `truncateLsn`/`peer_horizon_lsn` can
never be zero once we know `timeline_start_lsn`.

Closes https://github.com/neondatabase/neon/issues/6248
2024-01-18 18:55:24 +00:00
Christian Schwarz
e8f773387d pagebench: avoid noise about CopyFail in PS logs (#6392)
Before this patch, pagebench get-page-latest-lsn would sometimes cause
noisy errors in the pageserver log about the `CopyFail` protocol message.

refs https://github.com/neondatabase/neon/issues/6390
2024-01-18 18:50:42 +00:00
Christian Schwarz
00936d19e1 pagebench: use tracing panic hook (#6393) 2024-01-18 18:39:38 +00:00
Joonas Koivunen
57155ada77 temp: human readable summaries for relative access time compared to absolute (#6384)
When testing the new eviction order there is a problem: all of the
(currently rare) disk-usage-based eviction runs are rare and unique; this
PR adds a human-readable summary of what the absolute order would have done
and what the relative order does. The assumption is that this logging will
make the few eviction runs in staging more useful.

Cc: #5304 for allowing testing in staging
2024-01-18 17:21:08 +02:00
Konstantin Knizhnik
02b916d3c9 Use [NEON_SMGR] tag for all messages in neon extension (#6313)
## Problem

Use [NEON_SMGR] for all log messages produced by neon extension.

## Summary of changes


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-01-18 17:08:34 +02:00
Anastasia Lubennikova
e6e013b3b7 Fix pgbouncer settings update:
- Start pgbouncer in VM from postgres user, to allow connection to
pgbouncer admin console.
- Remove unused compute_ctl options --pgbouncer-connstr
and --pgbouncer-ini-path.
- Fix and cleanup code of connection to pgbouncer, add retries
because pgbouncer may not be instantly ready when compute_ctl starts.
2024-01-18 11:27:12 +00:00
John Spray
bd19290d9f pageserver: add shard_id to metric labels (#6308)
## Problem

tenant_id/timeline_id is no longer a full identifier for metrics from a
`Tenant` or `Timeline` object.

Closes: https://github.com/neondatabase/neon/issues/5953

## Summary of changes

Include `shard_id` label everywhere we have `tenant_id`/`timeline_id`
label.
2024-01-18 10:52:18 +00:00
Joonas Koivunen
a584e300d1 test: figure out the relative eviction order assertions (#6375)
I just failed to see this earlier in #6136. Layer counts are used as an
abstraction, and each of the two tenants loses proportionally about the
same number of layers. Sadly there is no difference between
`relative_spare` and `relative_equal`, as both of these end up evicting
the exact same number of layers, but I'll try to add another test
for those later.

Cc: #5304
2024-01-18 12:39:45 +02:00
Joonas Koivunen
e247ddbddc build: update h2 (#6383)
Notes: https://github.com/hyperium/h2/releases/tag/v0.3.24

Related: https://rustsec.org/advisories/RUSTSEC-2024-0003
2024-01-18 09:54:15 +00:00
Konstantin Knizhnik
0dc4c9b0b8 Relsize hash lru eviction (#6353)
## Problem


Currently the relation size hash is limited by the "neon.relsize_hash_size" GUC,
with a default value of 64k.
64k relations is not such a small number... but creating just 376 databases is
enough to exhaust it.

## Summary of changes

Use an LRU replacement algorithm to prevent hash overflow.
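
A minimal sketch of the idea, a bounded map that evicts its least-recently-used entry instead of overflowing (the actual change is in the neon extension's C shared-memory hash; this is only an illustration):

```rust
use std::collections::HashMap;

/// Bounded map that evicts its least-recently-used entry when full,
/// instead of overflowing like a fixed-capacity hash table.
struct LruCache<K, V> {
    capacity: usize,
    clock: u64,
    entries: HashMap<K, (V, u64)>, // value + last-access tick
}

impl<K: std::hash::Hash + Eq + Clone, V> LruCache<K, V> {
    fn new(capacity: usize) -> Self {
        Self { capacity, clock: 0, entries: HashMap::new() }
    }

    fn get(&mut self, key: &K) -> Option<&V> {
        self.clock += 1;
        let clock = self.clock;
        match self.entries.get_mut(key) {
            Some((value, tick)) => {
                *tick = clock; // touch on access
                Some(&*value)
            }
            None => None,
        }
    }

    fn insert(&mut self, key: K, value: V) {
        self.clock += 1;
        if !self.entries.contains_key(&key) && self.entries.len() >= self.capacity {
            // Full and the key is new: evict the least-recently-used entry.
            let victim = self
                .entries
                .iter()
                .min_by_key(|(_, (_, tick))| *tick)
                .map(|(k, _)| k.clone());
            if let Some(victim) = victim {
                self.entries.remove(&victim);
            }
        }
        self.entries.insert(key, (value, self.clock));
    }
}
```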


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-01-17 20:34:30 +02:00
John Spray
b6ec11ad78 control_plane: generalize attachment_service to handle sharding (#6251)
## Problem

To test sharding, we need something to control it. We could write Python
code for doing this from the test runner, but this wouldn't be usable
when running neon_local directly, and when we want to write tests with a large
number of shards/tenants, Rust is a better fit for efficiently handling all
the required state.

This service enables automated tests to easily get a system with
sharding/HA without the test itself having to set this all up by hand:
existing tests can be run against sharded tenants just by setting a
shard count when creating the tenant.

## Summary of changes

Attachment service was previously a map of TenantId->TenantState, where
the principal state stored for each tenant was the generation and the
last attached pageserver. This enabled it to serve the re-attach and
validate requests that the pageserver requires.

In this PR, the scope of the service is extended substantially to do
overall management of tenants in the pageserver, including
tenant/timeline creation, live migration, evacuation of offline
pageservers etc. This is done using synchronous code to make declarative
changes to the tenant's intended state (`TenantState.policy` and
`TenantState.intent`), which are then translated into calls into the
pageserver by the `Reconciler`.

Top level summary of modules within
`control_plane/attachment_service/src`:
- `tenant_state`: structure that represents one tenant shard.
- `service`: implements the main high-level operations such as tenant/timeline
creation, marking a node offline, etc.
- `scheduler`: for operations that need to pick a pageserver for a
tenant, construct a scheduler and call into it.
- `compute_hook`: receive notifications when a tenant shard is attached
somewhere new. Once we have locations for all the shards in a tenant,
emit an update to postgres configuration via the neon_local `LocalEnv`.
- `http`: HTTP stubs. These mostly map to methods on `Service`, but are
separated for readability and so that it'll be easier to adapt if/when
we switch to another RPC layer.
- `node`: structure that describes a pageserver node. The most important
attribute of a node is its availability: marking a node offline causes
tenant shards to reschedule away from it.

This PR is a precursor to implementing the full sharding service for
prod (#6342). What's the difference between this and a production-ready
controller for pageservers?
- JSON file persistence to be replaced with a database
- Limited observability.
- No concurrency limits. Marking a pageserver offline will try and
migrate every tenant to a new pageserver concurrently, even if there are
thousands.
- Very simple scheduler that only knows to pick the pageserver with the
fewest tenants, and place secondary locations on a different pageserver
than attached locations: it does not try to place shards for the same
tenant on different pageservers. This matters little in tests, because
picking the least-used pageserver usually results in round-robin
placement.
- Scheduler state is rebuilt exhaustively for each operation that
requires a scheduler.
- Relies on neon_local mechanisms for updating postgres: in production
this would be something that flows through the real control plane.

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2024-01-17 18:01:08 +00:00
John Spray
4cec95ba13 pageserver: add list API for LocationConf (#6329)
## Problem

The `/v1/tenant` listing API only applies to attached tenants.

For an external service to implement a global reconciliation of its list
of shards vs. what's on the pageserver, we need a full view of what's in
TenantManager, including secondary tenant locations, and InProgress
locations.

Dependency of https://github.com/neondatabase/neon/pull/6251

## Summary of changes

- Add methods to Tenant and SecondaryTenant to reconstruct the
LocationConf used to create them.
- Add `GET /v1/location_config` API
2024-01-17 13:34:51 +00:00
Arpad Müller
ab86060d97 Copy initdb if loading from different timeline ID (#6363)
Previously, if we:

1. created a new timeline B from a different timeline A's initdb
2. deleted timeline A

the initdb for timeline B would be gone, at least in a world where we
are deleting initdbs upon timeline deletion. This world is imminent
(#6226).

Therefore, if the pageserver is instructed to load the initdb from a
different timeline ID, copy it to the newly created timeline's directory
in S3. This ensures that we can disaster recover the new timeline as
well, regardless of whether the original timeline was deleted or not.

Part of https://github.com/neondatabase/neon/issues/5282.
2024-01-17 12:42:42 +01:00
Arpad Müller
6ffdcfe6a4 remote_storage: unify azure and S3 tests (#6364)
The remote_storage crate contains two copies of each test, one for azure
and one for S3. The repetition is not necessary and makes the tests more
prone to drift, so we remove it by moving the tests into a shared
module.

The module has a different name depending on where it is included, so
that each test still has "s3" or "azure" in its full path, allowing you
to run just the S3 tests or just the Azure tests.

An earlier PR already removed some duplication: #6176

Fixes #6146.
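
One generic way to get that effect (not necessarily the exact mechanism this PR uses) is to stamp a shared test body out under two module names, so the provider still shows up in each test's path:

```rust
// A shared body of tests, instantiated under two module names so that the
// provider ("s3" vs "azure") appears in each test's full path.
macro_rules! storage_tests {
    () => {
        #[test]
        fn upload_download_roundtrip() {
            // ...shared test body against the storage client under test...
        }
    };
}

mod s3 {
    storage_tests!(); // runs as `s3::upload_download_roundtrip`
}

mod azure {
    storage_tests!(); // runs as `azure::upload_download_roundtrip`
}
```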
2024-01-16 18:45:19 +01:00
115 changed files with 7073 additions and 2612 deletions

View File

@@ -1,2 +1,2 @@
[profile.default]
slow-timeout = "1m"
slow-timeout = { period = "20s", terminate-after = 3 }

Cargo.lock generated
View File

@@ -270,6 +270,32 @@ dependencies = [
"critical-section",
]
[[package]]
name = "attachment_service"
version = "0.1.0"
dependencies = [
"anyhow",
"camino",
"clap",
"control_plane",
"futures",
"git-version",
"hyper",
"metrics",
"pageserver_api",
"pageserver_client",
"postgres_backend",
"postgres_connection",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -1748,6 +1774,12 @@ dependencies = [
"termcolor",
]
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.1"
@@ -2106,9 +2138,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.19"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782"
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
dependencies = [
"bytes",
"fnv",
@@ -2116,7 +2148,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http",
"indexmap",
"indexmap 2.0.1",
"slab",
"tokio",
"tokio-util",
@@ -2452,6 +2484,16 @@ dependencies = [
"serde",
]
[[package]]
name = "indexmap"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
dependencies = [
"equivalent",
"hashbrown 0.14.0",
]
[[package]]
name = "infer"
version = "0.2.3"
@@ -3130,7 +3172,7 @@ dependencies = [
"fnv",
"futures-channel",
"futures-util",
"indexmap",
"indexmap 1.9.3",
"once_cell",
"pin-project-lite",
"thiserror",
@@ -3340,6 +3382,7 @@ dependencies = [
"const_format",
"enum-map",
"hex",
"humantime-serde",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -3524,7 +3567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
dependencies = [
"fixedbitset",
"indexmap",
"indexmap 1.9.3",
]
[[package]]
@@ -3948,6 +3991,7 @@ dependencies = [
"url",
"utils",
"uuid",
"walkdir",
"webpki-roots 0.25.2",
"workspace_hack",
"x509-parser",
@@ -4929,7 +4973,7 @@ dependencies = [
"base64 0.13.1",
"chrono",
"hex",
"indexmap",
"indexmap 1.9.3",
"serde",
"serde_json",
"serde_with_macros",
@@ -4987,9 +5031,9 @@ dependencies = [
[[package]]
name = "shlex"
version = "1.1.0"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "signal-hook"
@@ -5630,7 +5674,7 @@ version = "0.19.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
dependencies = [
"indexmap",
"indexmap 1.9.3",
"serde",
"serde_spanned",
"toml_datetime",
@@ -5722,7 +5766,7 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
dependencies = [
"futures-core",
"futures-util",
"indexmap",
"indexmap 1.9.3",
"pin-project",
"pin-project-lite",
"rand 0.8.5",
@@ -6593,9 +6637,11 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.0",
"hex",
"hmac",
"hyper",
"indexmap 1.9.3",
"itertools",
"libc",
"log",

View File

@@ -3,6 +3,7 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/attachment_service",
"pageserver",
"pageserver/ctl",
"pageserver/client",

View File

@@ -143,6 +143,8 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
#########################################################################################
FROM build-deps AS plv8-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
@@ -617,6 +619,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
@@ -779,6 +782,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
#
#########################################################################################
FROM build-deps AS neon-pg-ext-build
ARG PG_VERSION
# Public extensions
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /sfcgal/* /
@@ -883,8 +888,10 @@ FROM debian:bullseye-slim
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
mkdir /var/db/postgres/pgbouncer && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
chmod 0750 /var/db/postgres/pgbouncer && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
# create folder for file cache
mkdir -p -m 777 /neon/cache

View File

@@ -32,8 +32,6 @@
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
//! ```
//!
use std::collections::HashMap;
@@ -112,9 +110,6 @@ fn main() -> Result<()> {
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -225,8 +220,6 @@ fn main() -> Result<()> {
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
};
let compute = Arc::new(compute_node);
@@ -523,23 +516,6 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-connstr")
.long("pgbouncer-connstr")
.default_value(
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
)
.value_name("PGBOUNCER_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-ini-path")
.long("pgbouncer-ini-path")
// Note: this doesn't match current path for pgbouncer.ini.
// Until we fix it, we need to pass the path explicitly
// or this will be effectively no-op.
.default_value("/etc/pgbouncer.ini")
.value_name("PGBOUNCER_INI_PATH"),
)
}
/// When compute_ctl is killed, send also termination signal to sync-safekeepers

View File

@@ -71,10 +71,6 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
// connection string to pgbouncer to change settings
pub pgbouncer_connstr: Option<String>,
// path to pgbouncer.ini to change settings
pub pgbouncer_ini_path: Option<String>,
}
// store some metrics about download size that might impact startup time
@@ -704,13 +700,14 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin` name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = self.connstr.clone();
let mut zenith_admin_connstr = connstr.clone();
zenith_admin_connstr
.set_username("zenith_admin")
@@ -723,8 +720,8 @@ impl ComputeNode {
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connsting with expected name
Client::connect(self.connstr.as_str(), NoTls)?
// reconnect with connstring with expected name
Client::connect(connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
@@ -738,8 +735,8 @@ impl ComputeNode {
cleanup_instance(&mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, self.connstr.as_str())?;
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
handle_grants(spec, &mut client, connstr.as_str())?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
@@ -747,6 +744,12 @@ impl ComputeNode {
// 'Close' connection
drop(client);
if self.has_feature(ComputeFeature::Migrations) {
thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client)
});
}
Ok(())
}
@@ -769,8 +772,8 @@ impl ComputeNode {
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
@@ -779,15 +782,9 @@ impl ComputeNode {
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let pgbouncer_settings = pgbouncer_settings.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
@@ -817,6 +814,10 @@ impl ComputeNode {
handle_grants(&spec, &mut client, self.connstr.as_str())?;
handle_extensions(&spec, &mut client)?;
handle_extension_neon(&mut client)?;
// We can skip handle_migrations here because a new migration can only appear
// if we have a new version of the compute_ctl binary, which can only happen
// if compute got restarted, in which case we'll end up inside of apply_config
// instead of reconfigure.
}
// 'Close' connection
@@ -852,8 +853,8 @@ impl ComputeNode {
);
// tune pgbouncer
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
info!("tuning pgbouncer");
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
@@ -862,15 +863,9 @@ impl ComputeNode {
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let pgbouncer_settings = pgbouncer_settings.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}

View File

@@ -366,7 +366,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
}
/// Update pgbouncer.ini with provided options
pub fn update_pgbouncer_ini(
fn update_pgbouncer_ini(
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
@@ -375,6 +375,10 @@ pub fn update_pgbouncer_ini(
for (option_name, value) in pgbouncer_config.iter() {
section.insert(option_name, value);
debug!(
"Updating pgbouncer.ini with new values {}={}",
option_name, value
);
}
conf.write_to_file(pgbouncer_ini_path)?;
@@ -384,49 +388,80 @@ pub fn update_pgbouncer_ini(
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
pgbouncer_settings: Option<HashMap<String, String>>,
pgbouncer_connstr: &str,
pgbouncer_ini_path: Option<String>,
) -> Result<()> {
if let Some(pgbouncer_config) = pgbouncer_settings {
// Apply new config
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
let (client, connection) = connect_result.unwrap();
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
// when pgbouncer is running under the same user.
"host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
} else {
// for k8s use normal connection string with password
// to connect to pgbouncer admin console
let mut pgbouncer_connstr =
"host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
}
pgbouncer_connstr
};
info!(
"Connecting to pgbouncer with connection string: {}",
pgbouncer_connstr
);
// connect to pgbouncer, retrying several times
// because pgbouncer may not be ready yet
let mut retries = 3;
let client = loop {
match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
Ok((client, connection)) => {
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
break client;
}
});
Err(e) => {
if retries == 0 {
return Err(e.into());
}
error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
retries -= 1;
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
};
for (option_name, value) in pgbouncer_config.iter() {
info!(
"Applying pgbouncer setting change: {} = {}",
option_name, value
// Apply new config
for (option_name, value) in pgbouncer_config.iter() {
let query = format!("SET {}={}", option_name, value);
// keep this log line for debugging purposes
info!("Applying pgbouncer setting change: {}", query);
if let Err(err) = client.simple_query(&query).await {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
let query = format!("SET {} = {}", option_name, value);
let result = client.simple_query(&query).await;
info!("Applying pgbouncer setting change: {}", query);
info!("pgbouncer setting change result: {:?}", result);
if let Err(err) = result {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
}
};
}
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
// in VMs we use /etc/pgbouncer.ini
"/etc/pgbouncer.ini".to_string()
} else {
// in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
// this is a shared volume between pgbouncer and postgres containers
// FIXME: fix permissions for this file
"/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
};
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
Ok(())
}

View File

@@ -727,3 +727,79 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
Ok(())
}
#[instrument(skip_all)]
pub fn handle_migrations(client: &mut Client) -> Result<()> {
info!("handle migrations");
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
let migrations = [
"ALTER ROLE neon_superuser BYPASSRLS",
r#"
DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;
"#,
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
query = "SELECT id FROM neon_migration.migration_id";
let row = client.query_one(query, &[])?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
query = "BEGIN";
client.simple_query(query)?;
while current_migration < migrations.len() {
info!("Running migration:\n{}\n", migrations[current_migration]);
client.simple_query(migrations[current_migration])?;
current_migration += 1;
}
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client.simple_query(&setval)?;
query = "COMMIT";
client.simple_query(query)?;
info!(
"Ran {} migrations",
(migrations.len() - starting_migration_id)
);
Ok(())
}

View File

@@ -0,0 +1,32 @@
[package]
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,116 @@
use std::collections::HashMap;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
use postgres_connection::parse_host_port;
use utils::id::{NodeId, TenantId};
pub(super) struct ComputeHookTenant {
shards: Vec<(ShardIndex, NodeId)>,
}
impl ComputeHookTenant {
pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
// Find the highest shard count and drop any shards that aren't
// for that shard count.
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
let Some(shard_count) = shard_count else {
// No shards, nothing to do.
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
return Ok(());
};
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
// We have pageservers for all the shards: proceed to reconfigure compute
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
tracing::warn!(
"Couldn't load neon_local config, skipping compute update ({e})"
);
return Ok(());
}
};
let cplane = ComputeControlPlane::load(env.clone())
.expect("Error loading compute control plane");
let compute_pageservers = self
.shards
.iter()
.map(|(_shard, node_id)| {
let ps_conf = env
.get_pageserver_conf(*node_id)
.expect("Unknown pageserver");
let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
(pg_host, pg_port.unwrap_or(5432))
})
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == "running" {
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
} else {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.0
);
}
Ok(())
}
}
/// The compute hook is a destination for notifications about changes to tenant:pageserver
/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures
/// the compute connection string.
pub(super) struct ComputeHook {
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
}
impl ComputeHook {
pub(super) fn new() -> Self {
Self {
state: Default::default(),
}
}
pub(super) async fn notify(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<()> {
tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
let mut locked = self.state.lock().await;
let entry = locked
.entry(tenant_shard_id.tenant_id)
.or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
let shard_index = ShardIndex {
shard_count: tenant_shard_id.shard_count,
shard_number: tenant_shard_id.shard_number,
};
let mut set = false;
for (existing_shard, existing_node) in &mut entry.shards {
if *existing_shard == shard_index {
*existing_node = node_id;
set = true;
}
}
if !set {
entry.shards.push((shard_index, node_id));
}
entry.maybe_reconfigure(tenant_shard_id.tenant_id).await
}
}

View File

@@ -0,0 +1,218 @@
use crate::reconciler::ReconcileError;
use crate::service::Service;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::TenantId;
use utils::{
http::{
endpoint::{self},
error::ApiError,
json::{json_request, json_response},
RequestExt, RouterBuilder,
},
id::NodeId,
};
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
use control_plane::attachment_service::{
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
TenantShardMigrateRequest,
};
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
let allowlist_routes = ["/status"]
.iter()
.map(|v| v.parse().unwrap())
.collect::<Vec<_>>();
Self {
service,
auth,
allowlist_routes,
}
}
}
#[inline(always)]
fn get_state(request: &Request<Body>) -> &HttpState {
request
.data::<Arc<HttpState>>()
.expect("unknown state type")
.as_ref()
}
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.re_attach(reattach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.validate(validate_req))
}
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.attach_hook(attach_req)
.await
.map_err(ApiError::InternalServerError)?,
)
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.inspect(inspect_req))
}
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.tenant_create(create_req).await?,
)
}
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_timeline_create(tenant_id, create_req)
.await?,
)
}
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req);
state.service.node_register(register_req).await?;
json_response(StatusCode::OK, ())
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
if node_id != config_req.node_id {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Path and body node_id differ"
)));
}
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
}
async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_shard_migrate(tenant_shard_id, migrate_req)
.await?,
)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
}
impl From<ReconcileError> for ApiError {
fn from(value: ReconcileError) -> Self {
ApiError::Conflict(format!("Reconciliation error: {}", value))
}
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router();
if auth.is_some() {
router = router.middleware(auth_middleware(|request| {
let state = get_state(request);
if state.allowlist_routes.contains(request.uri()) {
None
} else {
state.auth.as_deref()
}
}))
}
router
.data(Arc::new(HttpState::new(service, auth)))
.get("/status", |r| request_span(r, handle_status))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
.post("/node", |r| request_span(r, handle_node_register))
.put("/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
.post("/tenant", |r| request_span(r, handle_tenant_create))
.post("/tenant/:tenant_id/timeline", |r| {
request_span(r, handle_tenant_timeline_create)
})
.get("/tenant/:tenant_id/locate", |r| {
request_span(r, handle_tenant_locate)
})
.put("/tenant/:tenant_shard_id/migrate", |r| {
request_span(r, handle_tenant_shard_migrate)
})
}

View File

@@ -0,0 +1,57 @@
use serde::{Deserialize, Serialize};
use utils::seqwait::MonotonicCounter;
mod compute_hook;
pub mod http;
mod node;
pub mod persistence;
mod reconciler;
mod scheduler;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
}
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);
impl Sequence {
fn initial() -> Self {
Self(0)
}
}
impl std::fmt::Display for Sequence {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl MonotonicCounter<Sequence> for Sequence {
fn cnt_advance(&mut self, v: Sequence) {
assert!(*self <= v);
*self = v;
}
fn cnt_value(&self) -> Sequence {
*self
}
}
impl Sequence {
fn next(&self) -> Sequence {
Sequence(self.0 + 1)
}
}
impl Default for PlacementPolicy {
fn default() -> Self {
PlacementPolicy::Double(1)
}
}

View File

@@ -0,0 +1,100 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use attachment_service::http::make_router;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use camino::Utf8PathBuf;
use clap::Parser;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
/// Host and port to listen on, like `127.0.0.1:1234`
#[arg(short, long)]
listen: std::net::SocketAddr,
/// Path to public key for JWT authentication of clients
#[arg(long)]
public_key: Option<camino::Utf8PathBuf>,
/// Token for authenticating this service with the pageservers it controls
#[arg(short, long)]
jwt_token: Option<String>,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Utf8PathBuf,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
logging::init(
LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path,
args.listen
);
let config = Config {
jwt_token: args.jwt_token,
};
let persistence = Arc::new(Persistence::new(&args.path).await);
let service = Service::spawn(config, persistence).await?;
let http_listener = tcp_listener::bind(args.listen)?;
let auth = if let Some(public_key_path) = &args.public_key {
let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
} else {
None
};
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
})?;
Ok(())
}

View File

@@ -0,0 +1,37 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,
pub(crate) availability: NodeAvailability,
pub(crate) scheduling: NodeSchedulingPolicy,
pub(crate) listen_http_addr: String,
pub(crate) listen_http_port: u16,
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: u16,
}
impl Node {
pub(crate) fn base_url(&self) -> String {
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
}
/// Is this node elegible to have work scheduled onto it?
pub(crate) fn may_schedule(&self) -> bool {
match self.availability {
NodeAvailability::Active => {}
NodeAvailability::Offline => return false,
}
match self.scheduling {
NodeSchedulingPolicy::Active => true,
NodeSchedulingPolicy::Draining => false,
NodeSchedulingPolicy::Filling => true,
NodeSchedulingPolicy::Pause => false,
}
}
}

View File

@@ -0,0 +1,280 @@
use std::{collections::HashMap, str::FromStr};
use camino::{Utf8Path, Utf8PathBuf};
use control_plane::{
attachment_service::{NodeAvailability, NodeSchedulingPolicy},
local_env::LocalEnv,
};
use pageserver_api::{
models::TenantConfig,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::{
generation::Generation,
id::{NodeId, TenantId},
};
use crate::{node::Node, PlacementPolicy};
/// Placeholder for storage. This will be replaced with a database client.
pub struct Persistence {
state: std::sync::Mutex<PersistentState>,
}
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
tenants: HashMap<TenantShardId, TenantShardPersistence>,
#[serde(skip)]
path: Utf8PathBuf,
}
/// A convenience for serializing the state inside a sync lock, and then
/// writing it to disk outside of the lock. This will go away when switching
/// to a database backend.
struct PendingWrite {
bytes: Vec<u8>,
path: Utf8PathBuf,
}
impl PendingWrite {
async fn commit(&self) -> anyhow::Result<()> {
tokio::fs::write(&self.path, &self.bytes).await?;
Ok(())
}
}
impl PersistentState {
fn save(&self) -> PendingWrite {
PendingWrite {
bytes: serde_json::to_vec(self).expect("Serialization error"),
path: self.path.clone(),
}
}
async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
decoded.path = path.to_owned();
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: an old attachments.json from before PR #6251 may contain
// empty strings; replace them with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = format!("{}", tenant_id);
tenant.config = serde_json::to_string(&TenantConfig::default())?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
}
}
Ok(decoded)
}
async fn load_or_new(path: &Utf8Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path);
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path);
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
}
}
}
}
impl Persistence {
pub async fn new(path: &Utf8Path) -> Self {
let state = PersistentState::load_or_new(path).await;
Self {
state: std::sync::Mutex::new(state),
}
}
/// When registering a node, persist it so that on next start we will be able to
/// iterate over known nodes to synchronize their tenant shard states with our observed state.
pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
// TODO: node persistence will come with the database backend
Ok(())
}
/// At startup, we populate the service's list of nodes, and use this list to call into
/// each node to do an initial reconciliation of the state of the world with our in-memory
/// observed state.
pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
let env = LocalEnv::load_config()?;
// TODO: node persistence will come with the database backend
// XXX hack: enable test_backward_compatibility to work by populating our list of
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
// first startup in the compat test, we may have shards but no nodes.
let mut result = Vec::new();
tracing::info!(
"Loaded {} pageserver nodes from LocalEnv",
env.pageservers.len()
);
for ps_conf in env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
result.push(Node {
id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
});
}
Ok(result)
}
/// At startup, we populate our map of tenant shards from persistent storage.
pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
let locked = self.state.lock().unwrap();
Ok(locked.tenants.values().cloned().collect())
}
/// Tenants must be persisted before we schedule them for the first time. This enables us
/// to correctly retain generation monotonicity, and the externally provided placement policy & config.
pub(crate) async fn insert_tenant_shards(
&self,
shards: Vec<TenantShardPersistence>,
) -> anyhow::Result<()> {
let write = {
let mut locked = self.state.lock().unwrap();
for shard in shards {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
shard_number: ShardNumber(shard.shard_number as u8),
shard_count: ShardCount(shard.shard_count as u8),
};
locked.tenants.insert(tenant_shard_id, shard);
}
locked.save()
};
write.commit().await?;
Ok(())
}
/// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
/// advancing generation number. We also store the NodeId for which the generation was issued, so that in
/// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
pub(crate) async fn increment_generation(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<Generation> {
let (write, gen) = {
let mut locked = self.state.lock().unwrap();
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
shard.generation += 1;
shard.generation_pageserver = Some(node_id);
let gen = Generation::new(shard.generation);
(locked.save(), gen)
};
write.commit().await?;
Ok(gen)
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
let write = {
let mut locked = self.state.lock().unwrap();
let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
anyhow::bail!("Tried to increment generation of unknown shard");
};
shard.generation_pageserver = None;
locked.save()
};
write.commit().await?;
Ok(())
}
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
let (write, result) = {
let mut result = HashMap::new();
let mut locked = self.state.lock().unwrap();
for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
if shard.generation_pageserver == Some(node_id) {
shard.generation += 1;
result.insert(*tenant_shard_id, Generation::new(shard.generation));
}
}
(locked.save(), result)
};
write.commit().await?;
Ok(result)
}
// TODO: when we start shard splitting, we must durably mark the tenant so that
// on restart, we know that we must go through recovery (list shards that exist
// and pick up where we left off and/or revert to parent shards).
#[allow(dead_code)]
pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
todo!();
}
// TODO: when we finish shard splitting, we must atomically clean up the old shards
// and insert the new shards, and clear the splitting marker.
#[allow(dead_code)]
pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
todo!();
}
}
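// Sketch (illustrative only, not called anywhere in this diff): the `PendingWrite`
// pattern above collapsed into one generic helper. The std::sync::Mutex is held
// only while mutating and serializing; the tokio::fs::write runs outside the
// block, so the guard is never held across an .await point.
#[allow(dead_code)]
impl Persistence {
    async fn mutate_and_save<F>(&self, mutate: F) -> anyhow::Result<()>
    where
        F: FnOnce(&mut PersistentState),
    {
        let write = {
            let mut locked = self.state.lock().unwrap();
            mutate(&mut locked);
            locked.save()
        };
        write.commit().await
    }
}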
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Serialize, Deserialize, Clone)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
#[serde(default)]
pub(crate) shard_number: i32,
#[serde(default)]
pub(crate) shard_count: i32,
#[serde(default)]
pub(crate) shard_stripe_size: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: u32,
#[serde(default)]
pub(crate) placement_policy: String,
#[serde(default)]
pub(crate) config: String,
}

View File

@@ -0,0 +1,495 @@
use crate::persistence::Persistence;
use crate::service;
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_client::mgmt_api;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
use crate::compute_hook::ComputeHook;
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Generation,
pub(crate) intent: IntentState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
pub(crate) service_config: service::Config,
/// A snapshot of the pageservers as they were when we were asked
/// to reconcile.
pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
/// A hook to notify the running postgres instances when we change the location
/// of a tenant
pub(crate) compute_hook: Arc<ComputeHook>,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed.
pub(crate) cancel: CancellationToken,
/// Access to persistent storage for updating generation numbers
pub(crate) persistence: Arc<Persistence>,
}
#[derive(thiserror::Error, Debug)]
pub enum ReconcileError {
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Reconciler {
async fn location_config(
&mut self,
node_id: NodeId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
let node = self
.pageservers
.get(&node_id)
.expect("Pageserver may not be removed while referenced");
self.observed
.locations
.insert(node.id, ObservedStateLocation { conf: None });
tracing::info!("location_config({}) calling: {:?}", node_id, config);
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
client
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
.await?;
tracing::info!("location_config({}) complete: {:?}", node_id, config);
self.observed
.locations
.insert(node.id, ObservedStateLocation { conf: Some(config) });
Ok(())
}
async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
let destination = if let Some(node_id) = self.intent.attached {
match self.observed.locations.get(&node_id) {
Some(conf) => {
// We will do a live migration only if the intended destination is not
// currently in an attached state.
match &conf.conf {
Some(conf) if conf.mode == LocationConfigMode::Secondary => {
// Fall through to do a live migration
node_id
}
None | Some(_) => {
// Attached or uncertain: don't do a live migration, proceed
// with a general-case reconciliation
tracing::info!("maybe_live_migrate: destination is None or attached");
return Ok(());
}
}
}
None => {
// Our destination is not attached: maybe live migrate if some other
// node is currently attached. Fall through.
node_id
}
}
} else {
// No intent to be attached
tracing::info!("maybe_live_migrate: no attached intent");
return Ok(());
};
let mut origin = None;
for (node_id, state) in &self.observed.locations {
if let Some(observed_conf) = &state.conf {
if observed_conf.mode == LocationConfigMode::AttachedSingle {
let node = self
.pageservers
.get(node_id)
.expect("Nodes may not be removed while referenced");
// We will only attempt live migration if the origin is not offline: this
// avoids trying to do it while reconciling after responding to an HA failover.
if !matches!(node.availability, NodeAvailability::Offline) {
origin = Some(*node_id);
break;
}
}
}
}
let Some(origin) = origin else {
tracing::info!("maybe_live_migrate: no origin found");
return Ok(());
};
// We have an origin and a destination: proceed to do the live migration
tracing::info!("Live migrating {}->{}", origin, destination);
self.live_migrate(origin, destination).await?;
Ok(())
}
async fn get_lsns(
&self,
tenant_shard_id: TenantShardId,
node_id: &NodeId,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let node = self
.pageservers
.get(node_id)
.expect("Pageserver may not be removed while referenced");
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
let timelines = client.timeline_list(&tenant_shard_id).await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
.collect())
}
async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
let node = self
.pageservers
.get(node_id)
.expect("Pageserver may not be removed while referenced");
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
match client.tenant_secondary_download(tenant_shard_id).await {
Ok(()) => {}
Err(_) => {
tracing::info!(" (skipping, destination wasn't in secondary mode)")
}
}
}
async fn await_lsn(
&self,
tenant_shard_id: TenantShardId,
pageserver_id: &NodeId,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
Ok(l) => l,
Err(e) => {
println!(
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
pageserver_id
);
tokio::time::sleep(Duration::from_millis(500)).await;
continue;
}
};
let mut any_behind: bool = false;
for (timeline_id, baseline_lsn) in &baseline {
match latest.get(timeline_id) {
Some(latest_lsn) => {
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
if latest_lsn < baseline_lsn {
any_behind = true;
}
}
None => {
// Expected timeline isn't yet visible on migration destination.
// (IRL we would have to account for timeline deletion, but this
// is just a test helper)
any_behind = true;
}
}
}
if !any_behind {
println!("✅ LSN caught up. Proceeding...");
break;
} else {
tokio::time::sleep(Duration::from_millis(500)).await;
}
}
Ok(())
}
pub async fn live_migrate(
&mut self,
origin_ps_id: NodeId,
dest_ps_id: NodeId,
) -> anyhow::Result<()> {
// `maybe_live_migrate` is responsible for the sanity of these inputs
assert!(origin_ps_id != dest_ps_id);
fn build_location_config(
shard: &ShardIdentity,
config: &TenantConfig,
mode: LocationConfigMode,
generation: Option<Generation>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation: generation.map(|g| g.into().unwrap()),
secondary_conf,
tenant_conf: config.clone(),
shard_number: shard.number.0,
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
}
}
tracing::info!(
"🔁 Switching origin pageserver {} to stale mode",
origin_ps_id
);
// FIXME: it is incorrect to use self.generation here, we should use the generation
// from the ObservedState of the origin pageserver (it might be older than self.generation)
let stale_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
Some(self.generation),
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
// If we are migrating to a destination that has a secondary location, warm it up first
if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
if let Some(destination_conf) = &destination_conf.conf {
if destination_conf.mode == LocationConfigMode::Secondary {
tracing::info!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps_id,
);
self.secondary_download(self.tenant_shard_id, &dest_ps_id)
.await;
}
}
}
// Increment generation before attaching to new pageserver
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
Some(self.generation),
None,
);
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
self.location_config(dest_ps_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
tracing::info!("🕑 Waiting for LSN to catch up...");
self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
.await?;
}
tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
self.compute_hook
.notify(self.tenant_shard_id, dest_ps_id)
.await?;
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
// this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
.await?;
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
// partway through. In fact, all location conf API calls should be in a wrapper that sets
// the observed state to None, then runs, then sets it to what we wrote.
self.observed.locations.insert(
origin_ps_id,
ObservedStateLocation {
conf: Some(origin_secondary_conf),
},
);
println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps_id
);
let dest_final_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
.await?;
self.observed.locations.insert(
dest_ps_id,
ObservedStateLocation {
conf: Some(dest_final_conf),
},
);
println!("✅ Migration complete");
Ok(())
}
/// Reconciling a tenant makes API calls to pageservers until the observed state
/// matches the intended state.
///
/// First we apply special case handling (e.g. for live migrations), and then a
/// general case reconciliation where we walk through the intent by pageserver
/// and call out to the pageserver to apply the desired state.
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
// TODO: if any of self.observed is None, call to remote pageservers
// to learn correct state.
// Special case: live migration
self.maybe_live_migrate().await?;
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!("Observed configuration already correct.")
}
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!("Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
if let Err(e) = self
.compute_hook
.notify(self.tenant_shard_id, node_id)
.await
{
tracing::warn!(
"Failed to notify compute of newly attached pageserver {node_id}: {e}"
);
}
}
}
}
// Configure secondary locations: if these were previously attached this
// implicitly downgrades them from attached to secondary.
let mut changes = Vec::new();
for node_id in &self.intent.secondary {
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
match self.observed.locations.get(node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location.
tracing::info!(%node_id, "Observed configuration requires update.");
changes.push((*node_id, wanted_conf))
}
}
}
// Detach any extraneous pageservers that are no longer referenced
// by our intent.
let all_pageservers = self.intent.all_pageservers();
for node_id in self.observed.locations.keys() {
if all_pageservers.contains(node_id) {
// We are only detaching pageservers that aren't used at all.
continue;
}
changes.push((
*node_id,
LocationConfig {
mode: LocationConfigMode::Detached,
generation: None,
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
},
));
}
for (node_id, conf) in changes {
self.location_config(node_id, conf, None).await?;
}
Ok(())
}
}
pub(crate) fn attached_location_conf(
generation: Generation,
shard: &ShardIdentity,
config: &TenantConfig,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: generation.into(),
secondary_conf: None,
shard_number: shard.number.0,
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
}
pub(crate) fn secondary_location_conf(
shard: &ShardIdentity,
config: &TenantConfig,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: shard.number.0,
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
}

View File

@@ -0,0 +1,89 @@
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};
use crate::{node::Node, tenant_state::TenantState};
/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
#[error("No pageservers found")]
NoPageservers,
#[error("No pageserver found matching constraint")]
ImpossibleConstraint,
}
impl From<ScheduleError> for ApiError {
fn from(value: ScheduleError) -> Self {
ApiError::Conflict(format!("Scheduling error: {}", value))
}
}
pub(crate) struct Scheduler {
tenant_counts: HashMap<NodeId, usize>,
}
impl Scheduler {
pub(crate) fn new(
tenants: &BTreeMap<TenantShardId, TenantState>,
nodes: &HashMap<NodeId, Node>,
) -> Self {
let mut tenant_counts = HashMap::new();
for node_id in nodes.keys() {
tenant_counts.insert(*node_id, 0);
}
for tenant in tenants.values() {
if let Some(ps) = tenant.intent.attached {
let entry = tenant_counts.entry(ps).or_insert(0);
*entry += 1;
}
}
for (node_id, node) in nodes {
if !node.may_schedule() {
tenant_counts.remove(node_id);
}
}
Self { tenant_counts }
}
pub(crate) fn schedule_shard(
&mut self,
hard_exclude: &[NodeId],
) -> Result<NodeId, ScheduleError> {
if self.tenant_counts.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut tenant_counts: Vec<(NodeId, usize)> = self
.tenant_counts
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) {
None
} else {
Some((*k, *v))
}
})
.collect();
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
tenant_counts.sort_by_key(|i| (i.1, i.0));
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left
return Err(ScheduleError::ImpossibleConstraint);
}
for (node_id, count) in &tenant_counts {
tracing::info!("tenant_counts[{node_id}]={count}");
}
let node_id = tenant_counts.first().unwrap().0;
tracing::info!("scheduler selected node {node_id}");
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
Ok(node_id)
}
}
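#[cfg(test)]
mod tests {
    use super::*;

    // Sketch (illustrative, not part of the original diff): with no schedulable
    // pageservers, scheduling must fail with ScheduleError::NoPageservers rather
    // than panic. Building real `Node`/`TenantState` values is left out here
    // because they require many fields; see node.rs and tenant_state.rs.
    #[test]
    fn empty_scheduler_returns_no_pageservers() {
        let tenants: BTreeMap<TenantShardId, TenantState> = BTreeMap::new();
        let nodes: HashMap<NodeId, Node> = HashMap::new();
        let mut scheduler = Scheduler::new(&tenants, &nodes);
        assert!(matches!(
            scheduler.schedule_shard(&[]),
            Err(ScheduleError::NoPageservers)
        ));
    }
}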

File diff suppressed because it is too large

View File

@@ -0,0 +1,455 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::Persistence,
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
scheduler::{ScheduleError, Scheduler},
service, PlacementPolicy, Sequence,
};
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
// Runtime only: sequence used to coordinate updates to this object while
// background reconcilers may be running. A reconciler runs to a particular
// sequence.
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
pub(crate) policy: PlacementPolicy,
// Low level description of exactly which pageservers should fulfil
// which role. Generated by `Self::schedule`.
pub(crate) intent: IntentState,
// Low level description of how the tenant is configured on pageservers:
// if this does not match `Self::intent` then the tenant needs reconciliation
// with `Self::reconcile`.
pub(crate) observed: ObservedState,
// Tenant configuration, passed through opaquely to the pageserver. Identical
// for all shards in a tenant.
pub(crate) config: TenantConfig,
/// If a reconcile task is currently in flight, it may be joined here (it is
/// only safe to join if either the result has been received or the reconciler's
/// cancellation token has been fired)
pub(crate) reconciler: Option<ReconcilerHandle>,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// Indicates sequence number for which we have encountered an error reconciling. If
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
/// and callers should stop waiting for `waiter` and propagate the error.
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// The most recent error from a reconcile on this tenant
/// TODO: generalize to an array of recent events
/// TODO: use an ArcSwap instead of a mutex for faster reads?
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
}
#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
#[derive(Default, Clone)]
pub(crate) struct ObservedState {
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}
/// Our latest knowledge of how this tenant is configured in the outside world.
///
/// Meaning:
/// * No instance of this type exists for a node: we are certain that we have nothing configured on that
/// node for this shard.
/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.
pub(crate) conf: Option<LocationConfig>,
}
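// Sketch of how callers typically interpret the three cases described above
// (illustrative only; `observed` and `node_id` are hypothetical bindings):
//
//   match observed.locations.get(&node_id) {
//       None => { /* nothing configured on this node for this shard */ }
//       Some(ObservedStateLocation { conf: None }) => { /* unknown: reconcile it */ }
//       Some(ObservedStateLocation { conf: Some(conf) }) => { /* last applied config */ }
//   }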
pub(crate) struct ReconcilerWaiter {
// For observability purposes, remember the ID of the shard we're
// waiting for.
pub(crate) tenant_shard_id: TenantShardId,
seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
error: std::sync::Arc<std::sync::Mutex<String>>,
seq: Sequence,
}
#[derive(thiserror::Error, Debug)]
pub enum ReconcileWaitError {
#[error("Timeout waiting for shard {0}")]
Timeout(TenantShardId),
#[error("shutting down")]
Shutdown,
#[error("Reconcile error on shard {0}: {1}")]
Failed(TenantShardId, String),
}
impl ReconcilerWaiter {
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
tokio::select! {
result = self.seq_wait.wait_for_timeout(self.seq, timeout)=> {
result.map_err(|e| match e {
SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
})?;
},
result = self.error_seq_wait.wait_for(self.seq) => {
result.map_err(|e| match e {
SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
SeqWaitError::Timeout => unreachable!()
})?;
return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
}
}
Ok(())
}
}
/// Having spawned a reconciler task, the tenant shard's state will carry enough
/// information to optionally cancel & await it later.
pub(crate) struct ReconcilerHandle {
sequence: Sequence,
handle: JoinHandle<()>,
cancel: CancellationToken,
}
/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incomplete description
/// of state (i.e. any nodes present in the result should override nodes
/// present in the parent tenant state, but any unmentioned nodes should
/// not be removed from parent tenant state)
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl ObservedState {
pub(crate) fn new() -> Self {
Self {
locations: HashMap::new(),
}
}
}
impl TenantState {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
policy: PlacementPolicy,
) -> Self {
Self {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
reconciler: None,
sequence: Sequence(1),
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
}
}
/// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
/// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
/// to get an intent state that complies with placement policy. The overall goal is to do scheduling
/// in a way that makes use of any configured locations that already exist in the outside world.
pub(crate) fn intent_from_observed(&mut self) {
// Choose an attached location by filtering observed locations, and then sorting to get the highest
// generation
let mut attached_locs = self
.observed
.locations
.iter()
.filter_map(|(node_id, l)| {
if let Some(conf) = &l.conf {
if conf.mode == LocationConfigMode::AttachedMulti
|| conf.mode == LocationConfigMode::AttachedSingle
|| conf.mode == LocationConfigMode::AttachedStale
{
Some((node_id, conf.generation))
} else {
None
}
} else {
None
}
})
.collect::<Vec<_>>();
attached_locs.sort_by_key(|i| i.1);
if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
self.intent.attached = Some(*node_id);
}
// All remaining observed locations generate secondary intents. This includes None
// observations, as these may well have some local content on disk that is usable (this
// is an edge case that might occur if we restarted during a migration or other change)
self.observed.locations.keys().for_each(|node_id| {
if Some(*node_id) != self.intent.attached {
self.intent.secondary.push(*node_id);
}
});
}
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
// pageservers if so.
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut used_pageservers = self.intent.all_pageservers();
let mut modified = false;
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.secondary.clear();
modified = true;
}
}
Double(secondary_count) => {
// Should have exactly one attached, and N secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.secondary.push(node_id);
used_pageservers.push(node_id);
modified = true;
}
}
}
if modified {
self.sequence.0 += 1;
}
Ok(())
}
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
return true;
}
}
}
for node_id in &self.intent.secondary {
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
match self.observed.locations.get(node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
return true;
}
}
}
false
}
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
pageservers: &Arc<HashMap<NodeId, Node>>,
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
let mut dirty_observed = false;
for (node_id, observed_loc) in &self.observed.locations {
let node = pageservers
.get(node_id)
.expect("Nodes may not be removed while referenced");
if observed_loc.conf.is_none()
&& !matches!(node.availability, NodeAvailability::Offline)
{
dirty_observed = true;
break;
}
}
if !self.dirty() && !dirty_observed {
tracing::info!("Not dirty, no reconciliation needed.");
return None;
}
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
});
}
}
// Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
// doing our sequence's work.
let old_handle = self.reconciler.take();
let cancel = CancellationToken::new();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
shard: self.shard,
generation: self.generation,
intent: self.intent.clone(),
config: self.config.clone(),
observed: self.observed.clone(),
pageservers: pageservers.clone(),
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
cancel: cancel.clone(),
persistence: persistence.clone(),
};
let reconcile_seq = self.sequence;
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
let join_handle = tokio::task::spawn(async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
let result = reconciler.reconcile().await;
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
})
.ok();
});
self.reconciler = Some(ReconcilerHandle {
sequence: self.sequence,
handle: join_handle,
cancel,
});
Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
})
}
}
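// Sketch (illustrative): how a caller in service.rs is expected to consume the
// waiter returned by `maybe_reconcile`. The names `tenant` and RECONCILE_TIMEOUT
// are assumptions for the example, not items defined in this diff.
//
//   if let Some(waiter) =
//       tenant.maybe_reconcile(result_tx, &pageservers, &compute_hook, &config, &persistence)
//   {
//       waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
//   }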

View File

@@ -1,14 +1,27 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize};
use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId};
use hyper::Method;
use pageserver_api::{
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
shard::TenantShardId,
};
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{path::PathBuf, process::Child, str::FromStr};
use tracing::instrument;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId},
};
pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: PathBuf,
jwt_token: Option<String>,
public_key_path: Option<Utf8PathBuf>,
client: reqwest::Client,
}
@@ -16,7 +29,7 @@ const COMMAND: &str = "attachment_service";
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_id: TenantId,
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
}
@@ -27,7 +40,7 @@ pub struct AttachHookResponse {
#[derive(Serialize, Deserialize)]
pub struct InspectRequest {
pub tenant_id: TenantId,
pub tenant_shard_id: TenantShardId,
}
#[derive(Serialize, Deserialize)]
@@ -35,6 +48,125 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
/// Explicitly migrating a particular shard is a low-level operation.
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeAvailability {
// Normal, happy state
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
Draining,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
Draining => "draining",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = env.base_data_dir.join("attachments.json");
@@ -49,10 +181,34 @@ impl AttachmentService {
listen_url.port().unwrap()
);
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
.pageservers
.first()
.expect("Config is validated to contain at least one pageserver");
let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
AuthType::Trust => (None, None),
AuthType::NeonJWT => {
let jwt_token = env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
// If pageserver auth is enabled, this implicitly enables auth for this service,
// using the same credentials.
let public_key_path =
camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
.unwrap();
(Some(jwt_token), Some(public_key_path))
}
};
Self {
env: env.clone(),
path,
listen,
jwt_token,
public_key_path,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
@@ -67,72 +223,199 @@ impl AttachmentService {
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
background_process::start_process(
let mut args = vec!["-l", &self.listen, "-p", &path_str]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if let Some(jwt_token) = &self.jwt_token {
args.push(format!("--jwt-token={jwt_token}"));
}
if let Some(public_key_path) = &self.public_key_path {
args.push(format!("--public-key={public_key_path}"));
}
let result = background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
["-l", &self.listen, "-p", &path_str],
[],
args,
[(
"NEON_REPO_DIR".to_string(),
self.env.base_data_dir.to_string_lossy().to_string(),
)],
background_process::InitialPidFile::Create(self.pid_file()),
// TODO: a real status check
|| async move { anyhow::Ok(true) },
|| async {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
},
)
.await
.await;
for ps_conf in &self.env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
self.node_register(NodeRegisterRequest {
node_id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
result
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
pub async fn attach_hook(
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
tenant_id: TenantId,
pageserver_id: NodeId,
) -> anyhow::Result<Option<u32>> {
use hyper::StatusCode;
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> anyhow::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join("attach-hook")
.join(&path)
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await?;
let response = response.error_from_body().await?;
Ok(response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
#[instrument(skip(self))]
pub async fn attach_hook(
&self,
tenant_shard_id: TenantShardId,
pageserver_id: NodeId,
) -> anyhow::Result<Option<u32>> {
let request = AttachHookRequest {
tenant_id,
tenant_shard_id,
node_id: Some(pageserver_id),
};
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = self
.dispatch::<_, AttachHookResponse>(
Method::POST,
"attach-hook".to_string(),
Some(request),
)
.await?;
let response = response.json::<AttachHookResponse>().await?;
Ok(response.gen)
}
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
#[instrument(skip(self))]
pub async fn inspect(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<Option<(u32, NodeId)>> {
let request = InspectRequest { tenant_shard_id };
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join("inspect")
.unwrap();
let response = self
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
.await?;
let request = InspectRequest { tenant_id };
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<InspectResponse>().await?;
Ok(response.attachment)
}
#[instrument(skip(self))]
pub async fn tenant_create(
&self,
req: TenantCreateRequest,
) -> anyhow::Result<TenantCreateResponse> {
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
.await
}
#[instrument(skip(self))]
pub async fn tenant_migrate(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
) -> anyhow::Result<TenantShardMigrateResponse> {
self.dispatch(
Method::PUT,
format!("tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id,
node_id,
}),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(
Method::PUT,
format!("node/{}/config", req.node_id),
Some(req),
)
.await
}
#[instrument(skip(self))]
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}
#[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
pub async fn tenant_timeline_create(
&self,
tenant_id: TenantId,
req: TimelineCreateRequest,
) -> anyhow::Result<TimelineInfo> {
self.dispatch(
Method::POST,
format!("tenant/{tenant_id}/timeline"),
Some(req),
)
.await
}
}
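// Sketch (illustrative): taking a pageserver out of scheduling via the typed
// client. `svc` stands for an AttachmentService built with `from_env`; the node
// id is made up.
//
//   svc.node_configure(NodeConfigureRequest {
//       node_id: NodeId(1),
//       availability: Some(NodeAvailability::Offline),
//       scheduling: None,
//   })
//   .await?;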

View File

@@ -1,355 +0,0 @@
/// The attachment service mimics the aspects of the control plane API
/// that are required for a pageserver to operate.
///
/// This enables running & testing pageservers without a full-blown
/// deployment of the Neon cloud platform.
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
use utils::{
http::{
endpoint::{self},
error::ApiError,
json::{json_request, json_response},
RequestExt, RouterBuilder,
},
id::{NodeId, TenantId},
tcp_listener,
};
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
ValidateResponseTenant,
};
use control_plane::attachment_service::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
#[command(arg_required_else_help(true))]
struct Cli {
/// Host and port to listen on, like `127.0.0.1:1234`
#[arg(short, long)]
listen: std::net::SocketAddr,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: PathBuf,
}
// The persistent state of each Tenant
#[derive(Serialize, Deserialize, Clone)]
struct TenantState {
// Currently attached pageserver
pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
generation: u32,
}
fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
V: Clone + Serialize,
{
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
transformed
.collect::<HashMap<String, V>>()
.serialize(serializer)
}
fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
{
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
TenantId::from_hex(k)
.map(|k| (k, v))
.map_err(serde::de::Error::custom)
})
.collect()
}
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
tenants: HashMap<TenantId, TenantState>,
#[serde(skip)]
path: PathBuf,
}
impl PersistentState {
async fn save(&self) -> anyhow::Result<()> {
let bytes = serde_json::to_vec(self)?;
tokio::fs::write(&self.path, &bytes).await?;
Ok(())
}
async fn load(path: &Path) -> anyhow::Result<Self> {
let bytes = tokio::fs::read(path).await?;
let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
decoded.path = path.to_owned();
Ok(decoded)
}
async fn load_or_new(path: &Path) -> Self {
match Self::load(path).await {
Ok(s) => {
tracing::info!("Loaded state file at {}", path.display());
s
}
Err(e)
if e.downcast_ref::<std::io::Error>()
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
.unwrap_or(false) =>
{
tracing::info!("Will create state file at {}", path.display());
Self {
tenants: HashMap::new(),
path: path.to_owned(),
}
}
Err(e) => {
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
}
}
}
}
/// State available to HTTP request handlers
#[derive(Clone)]
struct State {
inner: Arc<tokio::sync::RwLock<PersistentState>>,
}
impl State {
fn new(persistent_state: PersistentState) -> State {
Self {
inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
}
}
}
#[inline(always)]
fn get_state(request: &Request<Body>) -> &State {
request
.data::<Arc<State>>()
.expect("unknown state type")
.as_ref()
}
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
let mut response = ReAttachResponse {
tenants: Vec::new(),
};
for (t, state) in &mut locked.tenants {
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
// TODO(sharding): make this shard-aware
id: TenantShardId::unsharded(*t),
gen: state.generation,
});
}
}
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, response)
}
/// Pageserver calls into this before doing deletions, to confirm that it still
/// holds the latest generation for the tenants with deletions enqueued
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
let locked = get_state(&req).inner.read().await;
let mut response = ValidateResponse {
tenants: Vec::new(),
};
for req_tenant in validate_req.tenants {
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
let valid = tenant_state.generation == req_tenant.gen;
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {})",
req_tenant.id,
req_tenant.gen,
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid,
});
}
}
json_response(StatusCode::OK, response)
}
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
/// (in the real control plane this is unnecessary, because the same program is managing
/// generation numbers and doing attachments).
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
let tenant_state = locked
.tenants
.entry(attach_req.tenant_id)
.or_insert_with(|| TenantState {
pageserver: attach_req.node_id,
generation: 0,
});
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
}
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
tracing::info!(
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
attach_req.tenant_id,
tenant_state.generation,
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(
StatusCode::OK,
AttachHookResponse {
gen: attach_req.node_id.map(|_| generation),
},
)
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let locked = state.write().await;
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
json_response(
StatusCode::OK,
InspectResponse {
attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
},
)
}
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let locked = state.write().await;
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
json_response(
StatusCode::OK,
InspectResponse {
attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
},
)
}
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
.post("/tenant/:tenant_id", |r| {
request_span(r, handle_tenant_create)
})
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stdout,
)?;
let args = Cli::parse();
tracing::info!(
"Starting, state at {}, listening on {}",
args.path.to_string_lossy(),
args.listen
);
let persistent_state = PersistentState::load_or_new(&args.path).await;
let http_listener = tcp_listener::bind(args.listen)?;
let router = make_router(persistent_state)
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
ShutdownSignals::handle(|signal| match signal {
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
tracing::info!("Got {}. Terminating", signal.name());
// We're just a test helper: no graceful shutdown.
std::process::exit(0);
}
})?;
Ok(())
}

View File

@@ -8,20 +8,24 @@
use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::attachment_service::{
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
};
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::tenant_migration::migrate_tenant;
use control_plane::{broker, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::shard::TenantShardId;
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -31,6 +35,7 @@ use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use url::Host;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -277,10 +282,10 @@ fn print_timeline(
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
tenant_shard_id: &TenantShardId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(&TenantShardId::unsharded(*tenant_id))
.timeline_list(tenant_shard_id)
.await?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -298,6 +303,20 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
}
}
// Helper function to parse --tenant_id option, for commands that accept a shard suffix
fn get_tenant_shard_id(
sub_match: &ArgMatches,
env: &local_env::LocalEnv,
) -> anyhow::Result<TenantShardId> {
if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
tenant_id_from_arguments
} else if let Some(default_id) = env.default_tenant_id {
Ok(TenantShardId::unsharded(default_id))
} else {
anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant");
}
}
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
sub_match
.get_one::<String>("tenant-id")
@@ -306,6 +325,14 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
.context("Failed to parse tenant id from the argument string")
}
fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
sub_match
.get_one::<String>("tenant-id")
.map(|id_str| TenantShardId::from_str(id_str))
.transpose()
.context("Failed to parse tenant shard id from the argument string")
}
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
sub_match
.get_one::<String>("timeline-id")
@@ -394,47 +421,68 @@ async fn handle_tenant(
Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config")
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.map(|vals: clap::parser::ValuesRef<'_, String>| {
vals.flat_map(|c| c.split_once(':')).collect()
})
.unwrap_or_default();
let shard_count: u8 = create_match
.get_one::<u8>("shard-count")
.cloned()
.unwrap_or(0);
let shard_stripe_size: Option<u32> =
create_match.get_one::<u32>("shard-stripe-size").cloned();
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
// If tenant ID was not specified, generate one
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
let generation = if env.control_plane_api.is_some() {
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.attach_hook(tenant_id, pageserver.conf.id)
.await?
} else {
None
};
pageserver
.tenant_create(tenant_id, generation, tenant_conf)
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_create(TenantCreateRequest {
// Note that ::unsharded here isn't actually because the tenant is unsharded; it's because the
// attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
// type is used both in the attachment service (for creating tenants) and in the pageserver (for creating shards)
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
count: ShardCount(shard_count),
stripe_size: shard_stripe_size
.map(ShardStripeSize)
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
},
config: tenant_conf,
})
.await?;
println!("tenant {tenant_id} successfully created on the pageserver");
// Create an initial timeline for the new tenant
let new_timeline_id = parse_timeline_id(create_match)?;
let new_timeline_id =
parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
let pg_version = create_match
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver
.timeline_create(
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
// different shards picking different start lsns. Maybe we have to teach attachment service
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
attachment_service
.tenant_timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
ancestor_start_lsn: None,
existing_initdb_timeline_id: None,
pg_version: Some(pg_version),
},
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_string(),
@@ -442,9 +490,7 @@ async fn handle_tenant(
new_timeline_id,
)?;
println!(
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
);
println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
if create_match.get_flag("set-default") {
println!("Setting tenant {tenant_id} as a default one");
@@ -471,14 +517,64 @@ async fn handle_tenant(
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_id, new_pageserver).await?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_migrate(tenant_shard_id, new_pageserver_id)
.await?;
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
}
Some(("status", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let mut shard_table = comfy_table::Table::new();
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
let mut tenant_synthetic_size = None;
let attachment_service = AttachmentService::from_env(env);
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
let size = pageserver
.http_client
.tenant_details(shard.shard_id)
.await?
.tenant_info
.current_physical_size
.unwrap();
shard_table.add_row([
format!("{}", shard.shard_id.shard_slug()),
format!("{}", shard.node_id.0),
format!("{} MiB", size / (1024 * 1024)),
]);
if shard.shard_id.is_zero() {
tenant_synthetic_size =
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
}
}
let Some(synthetic_size) = tenant_synthetic_size else {
bail!("Shard 0 not found")
};
let mut tenant_table = comfy_table::Table::new();
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
tenant_table.add_row([
"Synthetic size".to_string(),
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
]);
println!("{tenant_table}");
println!("{shard_table}");
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
}
@@ -490,10 +586,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver
.timeline_list(&TenantShardId::unsharded(tenant_id))
.await?;
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
@@ -508,18 +604,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.context("Failed to parse postgres version from the argument string")?;
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)
let attachment_service = AttachmentService::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
existing_initdb_timeline_id: None,
ancestor_start_lsn: None,
pg_version: Some(pg_version),
};
let timeline_info = attachment_service
.tenant_timeline_create(tenant_id, create_req)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
@@ -577,7 +674,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
None,
pg_version,
ComputeMode::Primary,
DEFAULT_PAGESERVER_ID,
)?;
println!("Done");
}
@@ -601,17 +697,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)
let new_timeline_id = TimelineId::generate();
let attachment_service = AttachmentService::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: Some(ancestor_timeline_id),
existing_initdb_timeline_id: None,
ancestor_start_lsn: start_lsn,
pg_version: None,
};
let timeline_info = attachment_service
.tenant_timeline_create(tenant_id, create_req)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -638,8 +735,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
match sub_name {
"list" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id)
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
@@ -664,7 +763,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
for (endpoint_id, endpoint) in cplane
.endpoints
.iter()
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
{
let lsn_str = match endpoint.mode {
ComputeMode::Static(lsn) => {
@@ -683,7 +782,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
};
let branch_name = timeline_name_mappings
.get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
.get(&TenantTimelineId::new(
tenant_shard_id.tenant_id,
endpoint.timeline_id,
))
.map(|name| name.as_str())
.unwrap_or("?");
@@ -731,13 +833,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.copied()
.unwrap_or(false);
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
@@ -765,7 +860,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
http_port,
pg_version,
mode,
pageserver_id,
)?;
}
"start" => {
@@ -775,9 +869,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
} else {
DEFAULT_PAGESERVER_ID
None
};
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
@@ -808,7 +904,38 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
endpoint.timeline_id,
)?;
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
(
vec![(parsed.0, parsed.1.unwrap_or(5432))],
// If the caller is telling us which pageserver to use, this is not a tenant that is
// fully managed by the attachment service, and therefore not sharded.
ShardParameters::DEFAULT_STRIPE_SIZE,
)
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let attachment_service = AttachmentService::from_env(env);
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
let pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
let stripe_size = locate_result.shard_params.stripe_size;
(pageservers, stripe_size)
};
assert!(!pageservers.is_empty());
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
@@ -819,7 +946,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
println!("Starting existing endpoint {endpoint_id}...");
endpoint
.start(&auth_token, safekeepers, remote_ext_config)
.start(
&auth_token,
safekeepers,
pageservers,
remote_ext_config,
stripe_size.0 as usize,
)
.await?;
}
"reconfigure" => {
@@ -830,15 +963,31 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageserver_id =
let pageservers =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
vec![(
pageserver.pg_connection_config.host().clone(),
pageserver.pg_connection_config.port(),
)]
} else {
None
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported malformed host"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>()
};
endpoint.reconfigure(pageserver_id).await?;
endpoint.reconfigure(pageservers).await?;
}
"stop" => {
let endpoint_id = sub_args
@@ -962,6 +1111,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let attachment_service = AttachmentService::from_env(env);
attachment_service
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
@@ -1361,6 +1525,8 @@ fn cli() -> Command {
.arg(pg_version_arg.clone())
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
@@ -1371,6 +1537,9 @@ fn cli() -> Command {
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
)
.subcommand(
Command::new("pageserver")
@@ -1390,6 +1559,12 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("attachment_service")

View File

@@ -48,16 +48,16 @@ use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::RemoteExtSpec;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::models::ShardParameters;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use crate::attachment_service::AttachmentService;
use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
// contents of an endpoint.json file
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,7 +70,7 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
pageserver_id: NodeId,
features: Vec<ComputeFeature>,
}
//
@@ -122,19 +122,14 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
pageserver_id: NodeId,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver,
timeline_id,
mode,
tenant_id,
@@ -146,6 +141,7 @@ impl ComputeControlPlane {
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates: true,
features: vec![],
});
ep.create_endpoint_dir()?;
@@ -160,7 +156,7 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates: true,
pageserver_id,
features: vec![],
})?,
)?;
std::fs::write(
@@ -219,10 +215,12 @@ pub struct Endpoint {
// These are not part of the endpoint as such, but the environment
// the endpoint runs in.
pub env: LocalEnv,
pageserver: PageServerNode,
// Optimizations
skip_pg_catalog_updates: bool,
// Feature flags
features: Vec<ComputeFeature>,
}
impl Endpoint {
@@ -242,20 +240,17 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver,
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
features: conf.features,
})
}
@@ -470,11 +465,21 @@ impl Endpoint {
}
}
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
pageservers
.iter()
.map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
.collect::<Vec<_>>()
.join(",")
}
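// Illustrative sketch (not part of the diff; hosts and ports are invented): for a
// two-shard tenant this helper yields one URL per pageserver, comma-separated and
// without spaces, which is what later lands in pageserver_connstring.
#[allow(dead_code)]
fn example_pageserver_connstr() -> String {
    let pageservers: Vec<(Host, u16)> = vec![
        (Host::parse("ps1.local").expect("valid host"), 6400),
        (Host::parse("ps2.local").expect("valid host"), 6400),
    ];
    // => "postgresql://no_user@ps1.local:6400,postgresql://no_user@ps2.local:6400"
    Self::build_pageserver_connstr(&pageservers)
}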
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
@@ -488,13 +493,9 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstring.is_empty());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
@@ -525,7 +526,7 @@ impl Endpoint {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: vec![],
features: self.features.clone(),
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
@@ -544,7 +545,7 @@ impl Endpoint {
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE.0 as usize),
shard_stripe_size: Some(shard_stripe_size),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -667,7 +668,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -677,25 +678,27 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
// If we weren't given explicit pageservers, query the attachment service
if pageservers.is_empty() {
let attachment_service = AttachmentService::from_env(&self.env);
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
}
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstr.is_empty());
spec.pageserver_connstring = Some(pageserver_connstr);
let client = reqwest::Client::new();
let response = client
.post(format!(

View File

@@ -14,4 +14,3 @@ pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod tenant_migration;

View File

@@ -251,7 +251,13 @@ impl LocalEnv {
if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
Ok(conf)
} else {
bail!("could not find pageserver {id}")
let have_ids = self
.pageservers
.iter()
.map(|node| format!("{}:{}", node.id, node.listen_http_addr))
.collect::<Vec<_>>();
let joined = have_ids.join(",");
bail!("could not find pageserver {id}, have ids {joined}")
}
}

View File

@@ -17,7 +17,9 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{self, LocationConfig, ShardParameters, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -106,6 +108,16 @@ impl PageServerNode {
"control_plane_api='{}'",
control_plane_api.as_str()
));
// Attachment service uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap();
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
}
}
if !cli_overrides
@@ -301,16 +313,8 @@ impl PageServerNode {
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
}
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let mut settings = settings.clone();
let config = models::TenantConfig {
pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
let result = models::TenantConfig {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
@@ -371,6 +375,20 @@ impl PageServerNode {
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
} else {
Ok(result)
}
}
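// Usage sketch (not part of the diff; keys and values invented): parse_config consumes
// every recognized key and rejects leftovers, e.g.
//
//     let mut settings: HashMap<&str, &str> = HashMap::new();
//     settings.insert("checkpoint_distance", "262144");
//     settings.insert("gc_feedback", "true");
//     let conf = PageServerNode::parse_config(settings)?;   // ok
//
// whereas an unrecognized key such as "no_such_setting" makes it bail with
// "Unrecognized tenant settings: ...".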
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let config = Self::parse_config(settings.clone())?;
let request = models::TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
@@ -498,15 +516,13 @@ impl PageServerNode {
pub async fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: Option<TimelineId>,
tenant_shard_id: TenantShardId,
new_timeline_id: TimelineId,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
let req = models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
@@ -514,7 +530,10 @@ impl PageServerNode {
pg_version,
existing_initdb_timeline_id,
};
Ok(self.http_client.timeline_create(tenant_id, &req).await?)
Ok(self
.http_client
.timeline_create(tenant_shard_id, &req)
.await?)
}
/// Import a basebackup prepared using either:
@@ -592,4 +611,14 @@ impl PageServerNode {
Ok(())
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<TenantHistorySize> {
Ok(self
.http_client
.tenant_synthetic_size(tenant_shard_id)
.await?)
}
}

View File

@@ -1,232 +0,0 @@
//!
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
//! point to the new pageserver.
//!
use crate::local_env::LocalEnv;
use crate::{
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
pageserver::PageServerNode,
};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::time::Duration;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
/// Given an attached pageserver, retrieve the LSN for all timelines
async fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver
.timeline_list(&TenantShardId::unsharded(tenant_id))
.await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
.collect())
}
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
async fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(_e) => {
println!(
"🕑 Waiting for pageserver {} to activate...",
pageserver.conf.id
);
std::thread::sleep(Duration::from_millis(500));
continue;
}
};
let mut any_behind: bool = false;
for (timeline_id, baseline_lsn) in &baseline {
match latest.get(timeline_id) {
Some(latest_lsn) => {
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
if latest_lsn < baseline_lsn {
any_behind = true;
}
}
None => {
// Expected timeline isn't yet visible on migration destination.
// (IRL we would have to account for timeline deletion, but this
// is just a test helper)
any_behind = true;
}
}
}
if !any_behind {
println!("✅ LSN caught up. Proceeding...");
break;
} else {
std::thread::sleep(Duration::from_millis(500));
}
}
Ok(())
}
/// This function spans multiple services, to demonstrate live migration of a tenant
/// between pageservers:
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub async fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
println!("🤔 Checking existing status...");
let attachment_service = AttachmentService::from_env(env);
fn build_location_config(
mode: LocationConfigMode,
generation: Option<u32>,
secondary_conf: Option<LocationConfigSecondary>,
) -> LocationConfig {
LocationConfig {
mode,
generation,
secondary_conf,
tenant_conf: TenantConfig::default(),
shard_number: 0,
shard_count: 0,
shard_stripe_size: 0,
}
}
let previous = attachment_service.inspect(tenant_id).await?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps
.location_config(TenantShardId::unsharded(tenant_id), dest_conf, None)
.await?;
println!("✅ Migration complete");
return Ok(());
}
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps
.location_config(
TenantShardId::unsharded(tenant_id),
stale_conf,
Some(Duration::from_secs(10)),
)
.await?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}
println!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps.conf.id
);
match dest_ps
.tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
.await
{
Ok(()) => {}
Err(_) => {
println!(" (skipping, destination wasn't in secondary mode)")
}
}
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps
.location_config(TenantShardId::unsharded(tenant_id), dest_conf, None)
.await?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline).await?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id {
println!(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
}
}
for other_ps_conf in &env.pageservers {
if other_ps_conf.id == dest_ps.conf.id {
continue;
}
let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list().await?;
// Check if this tenant is attached
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i.tenant_id == tenant_id);
if !found {
continue;
}
// Downgrade to a secondary location
let secondary_conf = build_location_config(
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
println!(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps
.location_config(TenantShardId::unsharded(tenant_id), secondary_conf, None)
.await?;
}
println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps
.location_config(TenantShardId::unsharded(tenant_id), dest_conf, None)
.await?;
println!("✅ Migration complete");
Ok(())
}

View File

@@ -90,6 +90,9 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Enable running migrations
Migrations,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.

View File

@@ -19,6 +19,7 @@ strum.workspace = true
strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
workspace_hack.workspace = true

View File

@@ -1,9 +1,11 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::fmt;
use std::{fmt, ops::Range};
use crate::reltag::{BlockNumber, RelTag};
use crate::reltag::{BlockNumber, RelTag, SlruKind};
/// Key used in the Repository kv-store.
///
@@ -143,12 +145,390 @@ impl Key {
}
}
// Layout of the Key address space
//
// The Key struct, used to address the underlying key-value store, consists of
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
// all the data and metadata keys into those 18 bytes.
//
// Principles for the mapping:
//
// - Things that are often accessed or modified together, should be close to
// each other in the key space. For example, if a relation is extended by one
// block, we create a new key-value pair for the block data, and update the
// relation size entry. Because of that, the RelSize key comes after all the
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
// to each other.
//
// The key space is divided into five major sections, identified by the first
// byte, and they form a hierarchy:
//
// 00 Relation data and metadata
//
// DbDir () -> (dbnode, spcnode)
// Filenodemap
// RelDir -> relnode forknum
// RelBlocks
// RelSize
//
// 01 SLRUs
//
// SlruDir kind
// SlruSegBlocks segno
// SlruSegSize
//
// 02 pg_twophase
//
// 03 misc
// Controlfile
// checkpoint
// pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
//
// Filenodemap:
// 00 SPCNODE DBNODE 00000000 00 00000000
//
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
//
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
//
// SlruDir:
// 01 kind 00000000 00000000 00 00000000
//
// SlruSegBlock:
// 01 kind 00000001 SEGNO 00 BLKNUM
//
// SlruSegSize:
// 01 kind 00000001 SEGNO 00 FFFFFFFF
//
// TwoPhaseDir:
// 02 00000000 00000000 00000000 00 00000000
//
// TwoPhaseFile:
// 02 00000000 00000000 00000000 00 XID
//
// ControlFile:
// 03 00000000 00000000 00000000 00 00000000
//
// Checkpoint:
// 03 00000000 00000000 00000000 00 00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00 00000002
//
//-- Section 01: relation data and metadata
pub const DBDIR_KEY: Key = Key {
field1: 0x00,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
#[inline(always)]
pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0xffffffff,
field5: 0xff,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 1,
}
}
#[inline(always)]
pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
#[inline(always)]
pub fn rel_size_to_key(rel: RelTag) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0,
}..Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum + 1,
field6: 0,
}
}
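// Worked example (not part of the diff; oids invented): for a relation with
// spcnode=1663 (0x67F), dbnode=5, relnode=16384 (0x4000) and forknum=0,
// block 7 maps to   00 0000067F 00000005 00004000 00 00000007
// and RelSize to    00 0000067F 00000005 00004000 00 FFFFFFFF,
// so the size entry always sits right after the relation's last block, and
// rel_key_range covers both by bumping forknum in the exclusive end key.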
//-- Section 02: SLRUs
#[inline(always)]
pub fn slru_dir_to_key(kind: SlruKind) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]
pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: blknum,
}
}
#[inline(always)]
pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: 0xffffffff,
}
}
#[inline(always)]
pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
};
Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 0,
field6: 0,
}..Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 1,
field6: 0,
}
}
//-- Section 03: pg_twophase
pub const TWOPHASEDIR_KEY: Key = Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
#[inline(always)]
pub fn twophase_file_key(xid: TransactionId) -> Key {
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}
}
#[inline(always)]
pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
let (next_xid, overflowed) = xid.overflowing_add(1);
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}..Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: u8::from(overflowed),
field6: next_xid,
}
}
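// Worked example (not part of the diff): for xid = 0xFFFFFFFE the range is
// [.. field6=FFFFFFFE, .. field6=FFFFFFFF); for xid = 0xFFFFFFFF the increment
// overflows, so the exclusive end key carries field5=01, field6=00000000,
// keeping the range non-empty across the wraparound.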
//-- Section 03: Control file
pub const CONTROLFILE_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
pub const CHECKPOINT_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 1,
};
pub const AUX_FILES_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 2,
};
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
#[inline(always)]
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
#[inline(always)]
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
#[inline(always)]
pub fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[inline(always)]
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
#[inline(always)]
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (

View File

@@ -4,7 +4,7 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
time::SystemTime,
time::{Duration, SystemTime},
};
use byteorder::{BigEndian, ReadBytesExt};
@@ -251,7 +251,7 @@ impl std::ops::Deref for TenantCreateRequest {
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default)]
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
@@ -266,21 +266,41 @@ pub struct TenantConfig {
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
// We defer the parsing of the eviction_policy field to the request handler.
// Otherwise we'd have to move the types for eviction policy into this package.
// We might do that once the eviction feature has stabilized.
// For now, this field is not even documented in the openapi_spec.yml.
pub eviction_policy: Option<serde_json::Value>,
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
NoEviction,
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
}
impl EvictionPolicy {
pub fn discriminant_str(&self) -> &'static str {
match self {
EvictionPolicy::NoEviction => "NoEviction",
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
pub period: Duration,
#[serde(with = "humantime_serde")]
pub threshold: Duration,
}
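#[cfg(test)]
mod eviction_policy_example {
    use super::*;

    // Hedged illustration (not part of the diff; the durations are invented): with the
    // internally tagged representation above, eviction_policy values round-trip through
    // JSON as {"kind": "..."} objects, and humantime strings like "20m" parse into Durations.
    #[test]
    fn parses_eviction_policy_json() {
        let parsed: EvictionPolicy = serde_json::from_str(
            r#"{"kind":"LayerAccessThreshold","period":"20m","threshold":"10m"}"#,
        )
        .unwrap();
        assert_eq!(parsed.discriminant_str(), "LayerAccessThreshold");

        let no_eviction: EvictionPolicy =
            serde_json::from_str(r#"{"kind":"NoEviction"}"#).unwrap();
        assert_eq!(no_eviction.discriminant_str(), "NoEviction");
    }
}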
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
@@ -289,19 +309,21 @@ pub enum LocationConfigMode {
Detached,
}
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfigSecondary {
pub warm: bool,
}
/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
#[serde(default)]
pub generation: Option<u32>,
// If requesting mode `Secondary`, configuration for that.
#[serde(default)]
pub secondary_conf: Option<LocationConfigSecondary>,
@@ -314,11 +336,17 @@ pub struct LocationConfig {
#[serde(default)]
pub shard_stripe_size: u32,
// If requesting mode `Secondary`, configuration for that.
// Custom storage configuration for the tenant, if any
// This configuration only affects attached mode, but should be provided irrespective
// of the mode, as a secondary location might transition on startup if the response
// to the `/re-attach` control plane API requests it.
pub tenant_conf: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct LocationConfigListResponse {
pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
}
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TenantCreateResponse(pub TenantId);

View File

@@ -342,7 +342,7 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
stripe_size: ShardStripeSize,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}

View File

@@ -329,8 +329,8 @@ impl CheckPoint {
///
/// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool {
// nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround.
let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
new_xid =

View File

@@ -0,0 +1,288 @@
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::RemotePath;
use std::collections::HashSet;
use std::sync::Arc;
use test_context::test_context;
use tracing::debug;
use crate::common::{download_to_vec, upload_stream, wrap_stream};
use super::{
MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
};
/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test as ignored at runtime with the
/// default test framework; see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage, while avoiding uploading too many blobs to S3,
/// since the current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledStorageWithTestBlobs)]
#[tokio::test]
async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. The test skips the real code path and passes if the env vars are not set.
/// See `s3_pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorage::Enabled(ctx) => ctx,
MaybeEnabledStorage::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorage::Enabled(ctx) => ctx,
MaybeEnabledStorage::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let path_dest = RemotePath::new(Utf8Path::new(
format!("{}/file_dest", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
ctx.client.copy_object(&path, &path_dest).await?;
let dl = ctx.client.download(&path_dest).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete_objects(&[path.clone(), path_dest.clone()])
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}


@@ -6,301 +6,23 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::{
AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
};
use test_context::{test_context, AsyncTestContext};
use tracing::{debug, info};
use test_context::AsyncTestContext;
use tracing::info;
mod common;
use common::{
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
upload_stream, wrap_stream,
};
#[path = "common/tests.rs"]
mod tests_azure;
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
/// See the client creation in [`create_azure_client`] for details on the required env vars.
/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test as ignored at runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid interference between multiple test runs
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be the `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage while avoiding uploading too many blobs to Azure.
///
/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledAzureWithTestBlobs)]
#[tokio::test]
async fn azure_pagination_should_work(
ctx: &mut MaybeEnabledAzureWithTestBlobs,
) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("Azure init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
/// Tests that the Azure client can list all files in a folder, even if the response comes paginated and requires multiple Azure queries.
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. The test skips the real code and passes if the env vars are not set.
/// See `azure_pagination_should_work` for more information.
///
/// First, the test creates a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then it performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
#[tokio::test]
async fn azure_list_files_works(
ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("Azure init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzure::Enabled(ctx) => ctx,
MaybeEnabledAzure::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledAzure::Enabled(ctx) => ctx,
MaybeEnabledAzure::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let MaybeEnabledAzure::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
#[test_context(MaybeEnabledAzure)]
#[tokio::test]
async fn azure_copy_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
let MaybeEnabledAzure::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let path_dest = RemotePath::new(Utf8Path::new(
format!("{}/file_dest", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
ctx.client.copy_object(&path, &path_dest).await?;
let dl = ctx.client.download(&path_dest).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete_objects(&[path.clone(), path_dest.clone()])
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
struct EnabledAzure {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,
@@ -319,13 +41,13 @@ impl EnabledAzure {
}
}
enum MaybeEnabledAzure {
enum MaybeEnabledStorage {
Enabled(EnabledAzure),
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledAzure {
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -341,7 +63,7 @@ impl AsyncTestContext for MaybeEnabledAzure {
}
}
enum MaybeEnabledAzureWithTestBlobs {
enum MaybeEnabledStorageWithTestBlobs {
Enabled(AzureWithTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, AzureWithTestBlobs),
@@ -354,7 +76,7 @@ struct AzureWithTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -405,7 +127,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
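// Illustrative sketch (hypothetical keys, not part of the test data): with blobs
// `a/b/1.txt` and `a/c/2.txt` uploaded, list_prefixes reports the "directory"-like
// prefixes under a location, whereas list_files reports the individual blob paths
// such as `a/b/1.txt` and `a/c/2.txt`.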
enum MaybeEnabledAzureWithSimpleTestBlobs {
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(AzureWithSimpleTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
@@ -416,7 +138,7 @@ struct AzureWithSimpleTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {


@@ -6,297 +6,23 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::{test_context, AsyncTestContext};
use tracing::{debug, info};
use test_context::AsyncTestContext;
use tracing::info;
mod common;
use common::{
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
upload_stream, wrap_stream,
};
#[path = "common/tests.rs"]
mod tests_s3;
use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
/// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test as ignored at runtime with the
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
///
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
/// where
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between multiple test runs
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
///
/// Then, verifies that the client does return correct prefixes when queried:
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be the `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage while avoiding uploading too many blobs to S3,
/// since the current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledS3WithTestBlobs)]
#[tokio::test]
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
};
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
);
let nested_remote_prefixes = test_client
.list_prefixes(Some(&base_prefix))
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}
/// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. The test skips the real code and passes if the env vars are not set.
/// See `s3_pagination_should_work` for more information.
///
/// First, the test creates a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then it performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
#[tokio::test]
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path())
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(x).expect("must be valid path"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client.delete_objects(&[path1, path2]).await?;
let prefixes = ctx.client.list_prefixes(None).await?;
assert_eq!(prefixes.len(), 1);
ctx.client.delete_objects(&[path3]).await?;
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let MaybeEnabledS3::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete(&path)
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_copy_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let MaybeEnabledS3::Enabled(ctx) = ctx else {
return Ok(());
};
let path = RemotePath::new(Utf8Path::new(
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let path_dest = RemotePath::new(Utf8Path::new(
format!("{}/file_dest", ctx.base_prefix).as_str(),
))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
// Normal download request
ctx.client.copy_object(&path, &path_dest).await?;
let dl = ctx.client.download(&path_dest).await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
.delete_objects(&[path.clone(), path_dest.clone()])
.await
.with_context(|| format!("{path:?} removal"))?;
Ok(())
}
struct EnabledS3 {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,
@@ -315,13 +41,13 @@ impl EnabledS3 {
}
}
enum MaybeEnabledS3 {
enum MaybeEnabledStorage {
Enabled(EnabledS3),
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -337,7 +63,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
}
}
enum MaybeEnabledS3WithTestBlobs {
enum MaybeEnabledStorageWithTestBlobs {
Enabled(S3WithTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithTestBlobs),
@@ -350,7 +76,7 @@ struct S3WithTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -401,7 +127,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledS3WithSimpleTestBlobs {
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
@@ -412,7 +138,7 @@ struct S3WithSimpleTestBlobs {
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {


@@ -1,3 +1,4 @@
use std::num::ParseIntError;
use std::{fmt, str::FromStr};
use anyhow::Context;
@@ -374,6 +375,13 @@ impl fmt::Display for NodeId {
}
}
impl FromStr for NodeId {
type Err = ParseIntError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(NodeId(u64::from_str(s)?))
}
}
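// A minimal usage sketch (hypothetical value; parsing fails with a ParseIntError on non-numeric input):
// let node: NodeId = "7".parse().expect("node id must be a u64");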
#[cfg(test)]
mod tests {
use serde_assert::{Deserializer, Serializer, Token, Tokens};


@@ -28,8 +28,8 @@ pub enum Error {
pub type Result<T> = std::result::Result<T, Error>;
pub(crate) trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
}
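// Note: spelling the future out as `impl Future<Output = Result<Self>> + Send` (rather than
// using `async fn` in the trait) lets the trait require a `Send` future from implementors,
// which a bare `async fn` in a trait cannot express.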
impl ResponseErrorMessageExt for reqwest::Response {
@@ -209,14 +209,23 @@ impl Client {
Ok(())
}
pub async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
let path = format!("{}/v1/location_config", self.mgmt_api_endpoint);
self.request(Method::GET, &path, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
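// A hedged usage sketch (assumes an existing `client: Client` for the pageserver mgmt API):
// let resp = client.list_location_config().await?;
// for (tenant_shard_id, conf) in resp.tenant_shards { /* conf is None for in-progress slots */ }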
pub async fn timeline_create(
&self,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_id
self.mgmt_api_endpoint, tenant_shard_id
);
self.request(Method::POST, &uri, req)
.await?
@@ -237,6 +246,21 @@ impl Client {
.map_err(Error::ReceiveBody)
}
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> Result<Vec<TimelineInfo>> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_shard_id
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,


@@ -108,9 +108,32 @@ pub struct RelTagBlockNo {
}
impl PagestreamClient {
pub async fn shutdown(mut self) {
let _ = self.cancel_on_client_drop.take();
self.conn_task.await.unwrap();
pub async fn shutdown(self) {
let Self {
copy_both,
cancel_on_client_drop: cancel_conn_task,
conn_task,
} = self;
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
//
// If we drop(copy_both) first and then immediately drop the `cancel_on_client_drop`,
// the CopyFail message only makes it to the socket sometimes (i.e., it's a race).
//
// Further, the pageserver makes a lot of noise when it receives CopyFail.
// Computes don't send it in practice, they just hard-close the connection.
//
// So, let's behave like the computes and suppress the CopyFail as follows:
// kill the socket first, then drop copy_both.
//
// See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
//
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
// => https://github.com/neondatabase/neon/issues/6390
let _ = cancel_conn_task.unwrap();
conn_task.await.unwrap();
drop(copy_both);
}
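// Usage sketch (hypothetical caller): once the workload is finished or cancelled, hand the
// whole client over so the socket is torn down before `copy_both` is dropped:
// client.shutdown().await;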
pub async fn getpage(


@@ -404,23 +404,27 @@ async fn client(
.await
.unwrap();
start_work_barrier.wait().await;
while let Some(req) =
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
{
let start = Instant::now();
let res = tokio::select! {
res = client.getpage(req) => { res },
_ = cancel.cancelled() => { return; }
};
res.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
let do_requests = async {
start_work_barrier.wait().await;
while let Some(req) = work.recv().await {
let start = Instant::now();
client
.getpage(req)
.await
.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
};
tokio::select! {
res = do_requests => { res },
_ = cancel.cancelled() => {
client.shutdown().await;
return;
}
}
}


@@ -35,6 +35,7 @@ fn main() {
logging::Output::Stderr,
)
.unwrap();
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let args = Args::parse();
match args {


@@ -1126,11 +1126,12 @@ mod tests {
};
use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;
use remote_storage::{RemoteStorageKind, S3Config};
use utils::serde_percent::Percent;
use super::*;
use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};
use crate::DEFAULT_PG_VERSION;
const ALL_BASE_VALUES_TOML: &str = r#"
# Initial configuration file created by 'pageserver --init'


@@ -386,39 +386,56 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut warned = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
"took enough candidates for pressure to be relieved"
);
break;
let selection = select_victims(&candidates, usage_pre);
let mut candidates = candidates;
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
// for comparison here. this is a temporary measure to develop alternatives.
use std::fmt::Write;
let mut summary_buf = String::with_capacity(256);
{
let absolute_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{absolute_summary}").expect("string grows");
info!("absolute accessed selection summary: {summary_buf}");
}
if partition == &MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
let selection = select_victims(&candidates, usage_pre);
{
summary_buf.clear();
let relative_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{relative_summary}").expect("string grows");
info!("relative accessed selection summary: {summary_buf}");
}
usage_planned.add_available_bytes(candidate.layer.get_file_size());
evicted_amount += 1;
}
let usage_planned = match warned {
Some(respecting_tenant_min_resident_size) => PlannedUsage {
respecting_tenant_min_resident_size,
fallback_to_global_lru: Some(usage_planned),
},
None => PlannedUsage {
respecting_tenant_min_resident_size: usage_planned,
fallback_to_global_lru: None,
},
selection
} else {
selection
};
debug!(?usage_planned, "usage planned");
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -796,14 +813,16 @@ async fn collect_eviction_candidates(
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
overridden_size=s,
"using overridden min resident size for tenant"
);
s
} else {
debug!(
tenant_id=%tenant.tenant_id(),
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
max_layer_size,
"using max layer size as min_resident_size for tenant",
);
@@ -908,22 +927,80 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
match eviction_order {
EvictionOrder::AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
EvictionOrder::RelativeAccessed { .. } => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
}
}
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
// will sort later by candidate.relative_last_activity to compare the eviction selections.
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
Ok(EvictionCandidates::Finished(candidates))
}
/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
/// relieve pressure.
///
/// Returns the amount of candidates selected, with the planned usage.
fn select_victims<U: Usage>(
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
usage_pre: U,
) -> VictimSelection<U> {
let mut usage_when_switched = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
if !usage_planned.has_pressure() {
break;
}
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
usage_when_switched = Some((usage_planned, i));
}
usage_planned.add_available_bytes(candidate.layer.get_file_size());
evicted_amount += 1;
}
VictimSelection {
amount: evicted_amount,
usage_pre,
usage_when_switched,
usage_planned,
}
}
struct VictimSelection<U> {
amount: usize,
usage_pre: U,
usage_when_switched: Option<(U, usize)>,
usage_planned: U,
}
impl<U: Usage> VictimSelection<U> {
fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
debug!(
evicted_amount=%self.amount,
"took enough candidates for pressure to be relieved"
);
if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
}
let planned = match self.usage_when_switched {
Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
respecting_tenant_min_resident_size,
fallback_to_global_lru: Some(self.usage_planned),
},
None => PlannedUsage {
respecting_tenant_min_resident_size: self.usage_planned,
fallback_to_global_lru: None,
},
};
(self.amount, planned)
}
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
@@ -1008,6 +1085,137 @@ pub(crate) mod finite_f32 {
}
}
mod summary {
use super::finite_f32::FiniteF32;
use super::{EvictionCandidate, LayerCount};
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::time::SystemTime;
#[derive(Debug, Default)]
pub(super) struct EvictionSummary {
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
total: LayerCount,
last_absolute: Option<SystemTime>,
last_relative: Option<FiniteF32>,
}
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
let mut summary = EvictionSummary::default();
for item in iter {
let counts = summary
.evicted_per_tenant
.entry(*item.layer.get_tenant_shard_id())
.or_default();
let sz = item.layer.get_file_size();
counts.file_sizes += sz;
counts.count += 1;
summary.total.file_sizes += sz;
summary.total.count += 1;
summary.last_absolute = Some(item.last_activity_ts);
summary.last_relative = Some(item.relative_last_activity);
}
summary
}
}
struct SiBytesAmount(u64);
impl std::fmt::Display for SiBytesAmount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0 < 1024 {
return write!(f, "{}B", self.0);
}
let mut tmp = self.0;
let mut ch = 0;
let suffixes = b"KMGTPE";
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
tmp /= 1024;
ch += 1;
}
let ch = suffixes[ch] as char;
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
}
}
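// For example, `SiBytesAmount(512)` renders as "512B" and `SiBytesAmount(1536)` as "1.5KiB".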
impl std::fmt::Display for EvictionSummary {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// wasteful, but it's for testing
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
for (tenant_shard_id, count) in &self.evicted_per_tenant {
sorted
.entry(count.count)
.or_default()
.push((*tenant_shard_id, count.file_sizes));
}
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
writeln!(
f,
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
self.total.count, self.last_absolute, self.last_relative,
)?;
for (count, per_tenant) in sorted.iter().rev().take(10) {
write!(f, "- {count} layers: ")?;
if per_tenant.len() < 3 {
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
let bytes = SiBytesAmount(*bytes);
write!(f, "{tenant_shard_id} ({bytes})")?;
}
} else {
let num_tenants = per_tenant.len();
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
let total_bytes = SiBytesAmount(total_bytes);
let layers = num_tenants * count;
write!(
f,
"{num_tenants} tenants {total_bytes} in total {layers} layers",
)?;
}
writeln!(f)?;
}
if sorted.len() > 10 {
let (rem_count, rem_bytes) = sorted
.iter()
.rev()
.map(|(count, per_tenant)| {
(
count,
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
)
})
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
let rem_bytes = SiBytesAmount(rem_bytes);
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
}
Ok(())
}
}
}
mod filesystem_level_usage {
use anyhow::Context;
use camino::Utf8Path;


@@ -14,6 +14,7 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantState;
@@ -39,11 +40,11 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::UpsertLocationError;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -1235,7 +1236,7 @@ async fn tenant_create_handler(
json_response(
StatusCode::CREATED,
TenantCreateResponse(new_tenant.tenant_id()),
TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
)
}
@@ -1354,6 +1355,28 @@ async fn put_tenant_location_config_handler(
json_response(StatusCode::OK, ())
}
async fn list_location_config_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let slots = state.tenant_manager.list();
let result = LocationConfigListResponse {
tenant_shards: slots
.into_iter()
.map(|(tenant_shard_id, slot)| {
let v = match slot {
TenantSlot::Attached(t) => Some(t.get_location_conf()),
TenantSlot::Secondary(s) => Some(s.get_location_conf()),
TenantSlot::InProgress(_) => None,
};
(tenant_shard_id, v)
})
.collect(),
};
json_response(StatusCode::OK, result)
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
async fn handle_tenant_break(
r: Request<Body>,
@@ -1896,6 +1919,9 @@ pub fn make_router(
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
api_handler(r, put_tenant_location_config_handler)
})
.get("/v1/location_config", |r| {
api_handler(r, list_location_config_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})


@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
use utils::id::TimelineId;
/// Prometheus histogram buckets (in seconds) for operations in the critical
/// path. In other words, operations that directly affect that latency of user
@@ -59,7 +59,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
register_counter_vec!(
"pageserver_storage_operations_seconds_sum",
"Total time spent on storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "timeline_id"],
&["operation", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
@@ -68,7 +68,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
register_int_counter_vec!(
"pageserver_storage_operations_seconds_count",
"Count of storage operations with operation, tenant and timeline dimensions",
&["operation", "tenant_id", "timeline_id"],
&["operation", "tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
@@ -373,7 +373,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_last_record_lsn",
"Last record LSN grouped by timeline",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -382,7 +382,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -400,7 +400,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -433,7 +433,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_current_logical_size",
"Current logical size grouped by timeline",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define current logical size metric")
});
@@ -582,7 +582,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_broken_tenants_count",
"Set of broken tenants",
&["tenant_id"]
&["tenant_id", "shard_id"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
@@ -602,7 +602,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -611,7 +611,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -630,7 +630,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_evictions",
"Number of layers evicted from the pageserver",
&["tenant_id", "timeline_id"]
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -927,7 +927,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "timeline_id"]
&["operation", "tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -1002,7 +1002,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
&["smgr_query_type", "tenant_id", "timeline_id"],
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
@@ -1069,8 +1069,9 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_id.to_string();
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
@@ -1078,7 +1079,7 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[op.into()])
.unwrap();
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
.get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
.unwrap();
GlobalAndPerTimelineHistogram {
global,
@@ -1098,6 +1099,7 @@ impl SmgrQueryTimePerTimeline {
#[cfg(test)]
mod smgr_query_time_tests {
use pageserver_api::shard::TenantShardId;
use strum::IntoEnumIterator;
use utils::id::{TenantId, TimelineId};
@@ -1124,7 +1126,10 @@ mod smgr_query_time_tests {
for op in &ops {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
let metrics = super::SmgrQueryTimePerTimeline::new(
&TenantShardId::unsharded(tenant_id),
&timeline_id,
);
let get_counts = || {
let global: u64 = ops
@@ -1205,7 +1210,13 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
"Number of ongoing calls to remote timeline client. \
Used to populate pageserver_remote_timeline_client_calls_started. \
This metric is not useful for sampling from Prometheus, but useful in tests.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&[
"tenant_id",
"shard_id",
"timeline_id",
"file_kind",
"op_kind"
],
)
.expect("failed to define a metric")
});
@@ -1226,22 +1237,23 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_started",
"Incremented by the number of bytes associated with a remote timeline client operation. \
The increment happens when the operation is scheduled.",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_timeline_client_bytes_finished",
"Incremented by the number of bytes associated with a remote timeline client operation. \
The increment happens when the operation finishes (regardless of success/failure/shutdown).",
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
&["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
)
.expect("failed to define a metric")
});
@@ -1687,14 +1699,19 @@ pub(crate) struct StorageTimeMetrics {
}
impl StorageTimeMetrics {
pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
pub fn new(
operation: StorageTimeOperation,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> Self {
let operation: &'static str = operation.into();
let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
.unwrap();
let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
.get_metric_with_label_values(&[operation, tenant_id, timeline_id])
.get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
.unwrap();
let global_histogram = STORAGE_TIME_GLOBAL
.get_metric_with_label_values(&[operation])
@@ -1746,40 +1763,66 @@ impl TimelineMetrics {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
let create_images_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
let logical_size_histo =
StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
let flush_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::LayerFlush,
&tenant_id,
&shard_id,
&timeline_id,
);
let compact_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::Compact,
&tenant_id,
&shard_id,
&timeline_id,
);
let create_images_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::CreateImages,
&tenant_id,
&shard_id,
&timeline_id,
);
let logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::LogicalSize,
&tenant_id,
&shard_id,
&timeline_id,
);
let imitate_logical_size_histo = StorageTimeMetrics::new(
StorageTimeOperation::ImitateLogicalSize,
&tenant_id,
&shard_id,
&timeline_id,
);
let load_layer_map_histo = StorageTimeMetrics::new(
StorageTimeOperation::LoadLayerMap,
&tenant_id,
&shard_id,
&timeline_id,
);
let garbage_collect_histo = StorageTimeMetrics::new(
StorageTimeOperation::Gc,
&tenant_id,
&shard_id,
&timeline_id,
);
let load_layer_map_histo =
StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
let garbage_collect_histo =
StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO: we shouldn't expose this metric
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
@@ -1833,15 +1876,17 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ =
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -1854,29 +1899,42 @@ impl Drop for TimelineMetrics {
// outlive an individual smgr connection, but not the timeline.
for op in StorageTimeOperation::VARIANTS {
let _ =
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ =
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
op,
tenant_id,
shard_id,
timeline_id,
]);
let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
op,
tenant_id,
shard_id,
timeline_id,
]);
}
for op in STORAGE_IO_SIZE_OPERATIONS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
for op in SmgrQueryType::iter() {
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
op.into(),
tenant_id,
shard_id,
timeline_id,
]);
}
}
}
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let tid = tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
// Only shard zero deals in synthetic sizes
if tenant_shard_id.is_zero() {
let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
// we leave the BROKEN_TENANTS_SET entry if any
}
@@ -1926,6 +1984,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
pub(crate) struct RemoteTimelineClientMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
@@ -1937,6 +1996,7 @@ impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_id: format!("{}", tenant_shard_id.shard_slug()),
timeline_id: timeline_id.to_string(),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
@@ -1951,8 +2011,9 @@ impl RemoteTimelineClientMetrics {
PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.unwrap(),
)
@@ -1987,8 +2048,9 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
key.0,
key.1,
])
@@ -2018,8 +2080,9 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
key.0,
key.1,
])
@@ -2038,8 +2101,9 @@ impl RemoteTimelineClientMetrics {
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
key.0,
key.1,
])
@@ -2183,6 +2247,7 @@ impl Drop for RemoteTimelineClientMetrics {
fn drop(&mut self) {
let RemoteTimelineClientMetrics {
tenant_id,
shard_id,
timeline_id,
remote_physical_size_gauge,
calls_unfinished_gauge,
@@ -2192,6 +2257,7 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2200,6 +2266,7 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2208,6 +2275,7 @@ impl Drop for RemoteTimelineClientMetrics {
for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
@@ -2215,7 +2283,7 @@ impl Drop for RemoteTimelineClientMetrics {
}
{
let _ = remote_physical_size_gauge; // used to avoid an 'unused' warning in the destructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
}
}
@@ -2225,8 +2293,6 @@ impl Drop for RemoteTimelineClientMetrics {
pub(crate) trait MeasureRemoteOp: Sized {
fn measure_remote_op(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
metrics: Arc<RemoteTimelineClientMetrics>,
@@ -2234,8 +2300,6 @@ pub(crate) trait MeasureRemoteOp: Sized {
let start = Instant::now();
MeasuredRemoteOp {
inner: self,
tenant_id,
timeline_id,
file_kind,
op,
start,
@@ -2251,8 +2315,6 @@ pin_project! {
{
#[pin]
inner: F,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
start: Instant,

View File

@@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
use crate::pgdatadir_mapping::Version;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -384,11 +385,23 @@ impl PageServerHandler {
}
}
/// Analogous to calling cancelled() on a Timeline's cancellation token: waits for cancellation.
/// Future that completes when we need to shut down the connection.
///
/// We use many Timeline objects, and hold GateGuards on all of them. We must therefore respect
/// all of their cancellation tokens.
async fn timeline_cancelled(&self) {
/// We currently need to shut down when any of the following happens:
/// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
/// 2. task_mgr requests shutdown of the connection
///
/// NB on (1): the connection's lifecycle is not actually tied to any of the
/// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
/// implementation to be responsive to timeline cancellation because
/// the connection holds their `GateGuards` open (stored in `shard_timelines`).
/// We currently do the easy thing and terminate the connection if any of the
/// shard_timelines gets cancelled. But really, we could spend more effort
/// and simply remove the cancelled timeline from the `shard_timelines`, thereby
/// dropping the guard.
///
/// NB: keep in sync with [`Self::is_connection_cancelled`]
async fn await_connection_cancelled(&self) {
// A short wait before we expend the cycles to walk our timeline map. This avoids incurring
// that cost every time we check for cancellation.
tokio::time::sleep(Duration::from_millis(10)).await;
@@ -398,20 +411,26 @@ impl PageServerHandler {
// immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk
// missing any inserts to the map.
let mut futs = self
.shard_timelines
.values()
.map(|ht| ht.timeline.cancel.cancelled())
.collect::<FuturesUnordered<_>>();
futs.next().await;
let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
use futures::future::Either;
cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
cancellation_sources.extend(
self.shard_timelines
.values()
.map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
);
FuturesUnordered::from_iter(cancellation_sources)
.next()
.await;
}
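A minimal, self-contained sketch of the cancellation race set up above; `global_shutdown` here stands in for `task_mgr::shutdown_watcher()`, and the token slice stands in for the cancel tokens of the timelines held in `shard_timelines`:
use futures::future::Either;
use futures::stream::{FuturesUnordered, StreamExt};
use tokio_util::sync::CancellationToken;
// Stand-in for the process-wide shutdown signal.
async fn global_shutdown(token: &CancellationToken) {
    token.cancelled().await
}
// Resolves as soon as *any* source fires: the global shutdown or any per-timeline token.
async fn await_any_cancelled(global: &CancellationToken, timelines: &[CancellationToken]) {
    let mut sources = FuturesUnordered::new();
    // Either lets two different future types share one FuturesUnordered.
    sources.push(Either::Left(global_shutdown(global)));
    sources.extend(timelines.iter().map(|t| Either::Right(t.cancelled())));
    // The first completion is enough; we do not care which source it was.
    sources.next().await;
}
In the handler this future is then raced inside a biased `tokio::select!` against reading the next message, as the hunks below show.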
/// Analogous to calling is_cancelled() on a Timeline's cancellation token
fn timeline_is_cancelled(&self) -> bool {
self.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
/// Checking variant of [`Self::await_connection_cancelled`].
fn is_connection_cancelled(&self) -> bool {
task_mgr::is_shutdown_requested()
|| self
.shard_timelines
.values()
.any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
}
/// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in
@@ -432,7 +451,7 @@ impl PageServerHandler {
flush_r = pgb.flush() => {
Ok(flush_r?)
},
_ = self.timeline_cancelled() => {
_ = self.await_connection_cancelled() => {
Err(QueryError::Shutdown)
}
_ = cancel.cancelled() => {
@@ -545,13 +564,11 @@ impl PageServerHandler {
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
loop {
let msg = tokio::select! {
biased;
_ = self.timeline_cancelled() => {
_ = self.await_connection_cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
@@ -585,7 +602,6 @@ impl PageServerHandler {
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -595,7 +611,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::Nblocks(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -605,7 +620,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetPage(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
@@ -615,7 +629,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::DbSize(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -638,7 +651,7 @@ impl PageServerHandler {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
Err(e) if self.timeline_is_cancelled() => {
Err(e) if self.is_connection_cancelled() => {
// This branch accommodates code within request handlers that returns an anyhow::Error instead of a clean
// shutdown error; this may be buried inside a PageReconstructError::Other, for example.
//
@@ -865,6 +878,9 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetRelExists);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
@@ -888,6 +904,11 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetRelSize);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -910,6 +931,11 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetDbSize);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -1080,6 +1106,10 @@ impl PageServerHandler {
}
};
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)

View File

@@ -13,7 +13,12 @@ use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -1535,366 +1540,6 @@ struct SlruSegmentDirectory {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
// Layout of the Key address space
//
// The Key struct, used to address the underlying key-value store, consists of
// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
// all the data and metadata keys into those 18 bytes.
//
// Principles for the mapping:
//
// - Things that are often accessed or modified together, should be close to
// each other in the key space. For example, if a relation is extended by one
// block, we create a new key-value pair for the block data, and update the
// relation size entry. Because of that, the RelSize key comes after all the
// RelBlocks of a relation: the RelSize and the last RelBlock are always next
// to each other.
//
// The key space is divided into five major sections, identified by the first
// byte, and they form a hierarchy:
//
// 00 Relation data and metadata
//
// DbDir () -> (dbnode, spcnode)
// Filenodemap
// RelDir -> relnode forknum
// RelBlocks
// RelSize
//
// 01 SLRUs
//
// SlruDir kind
// SlruSegBlocks segno
// SlruSegSize
//
// 02 pg_twophase
//
// 03 misc
// Controlfile
// checkpoint
// pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
// 00 00000000 00000000 00000000 00 00000000
//
// Filenodemap:
// 00 SPCNODE DBNODE 00000000 00 00000000
//
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
//
// RelSize:
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
//
// SlruDir:
// 01 kind 00000000 00000000 00 00000000
//
// SlruSegBlock:
// 01 kind 00000001 SEGNO 00 BLKNUM
//
// SlruSegSize:
// 01 kind 00000001 SEGNO 00 FFFFFFFF
//
// TwoPhaseDir:
// 02 00000000 00000000 00000000 00 00000000
//
// TwoPhaseFile:
// 02 00000000 00000000 00000000 00 XID
//
// ControlFile:
// 03 00000000 00000000 00000000 00 00000000
//
// Checkpoint:
// 03 00000000 00000000 00000000 00 00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00 00000002
//
//-- Section 01: relation data and metadata
const DBDIR_KEY: Key = Key {
field1: 0x00,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0xffffffff,
field5: 0xff,
field6: 0xffffffff,
}
}
fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 0,
}
}
fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
Key {
field1: 0x00,
field2: spcnode,
field3: dbnode,
field4: 0,
field5: 0,
field6: 1,
}
}
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
fn rel_size_to_key(rel: RelTag) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0xffffffff,
}
}
fn rel_key_range(rel: RelTag) -> Range<Key> {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: 0,
}..Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum + 1,
field6: 0,
}
}
//-- Section 02: SLRUs
fn slru_dir_to_key(kind: SlruKind) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: blknum,
}
}
fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
Key {
field1: 0x01,
field2: match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
},
field3: 1,
field4: segno,
field5: 0,
field6: 0xffffffff,
}
}
fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
let field2 = match kind {
SlruKind::Clog => 0x00,
SlruKind::MultiXactMembers => 0x01,
SlruKind::MultiXactOffsets => 0x02,
};
Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 0,
field6: 0,
}..Key {
field1: 0x01,
field2,
field3: 1,
field4: segno,
field5: 1,
field6: 0,
}
}
//-- Section 03: pg_twophase
const TWOPHASEDIR_KEY: Key = Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
fn twophase_file_key(xid: TransactionId) -> Key {
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}
}
fn twophase_key_range(xid: TransactionId) -> Range<Key> {
let (next_xid, overflowed) = xid.overflowing_add(1);
Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: xid,
}..Key {
field1: 0x02,
field2: 0,
field3: 0,
field4: 0,
field5: u8::from(overflowed),
field6: next_xid,
}
}
//-- Section 03: Control file
const CONTROLFILE_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
};
const CHECKPOINT_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 1,
};
const AUX_FILES_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 2,
};
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow a timeline
// switch (and in general preserving them should probably be optional), so ignore these.
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
pub fn is_rel_vm_block_key(key: Key) -> bool {
key.field1 == 0x00
&& key.field4 != 0
&& key.field5 == VISIBILITYMAP_FORKNUM
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
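As a quick illustration of the addressing scheme these helpers implement (a minimal sketch against the `pageserver_api::key` re-exports listed in the import hunk above; the OIDs are made up):
use pageserver_api::key::{rel_block_to_key, rel_size_to_key};
use pageserver_api::reltag::RelTag;
fn main() {
    let rel = RelTag {
        spcnode: 1663,  // illustrative tablespace OID
        dbnode: 16384,  // illustrative database OID
        relnode: 16385, // illustrative relation OID
        forknum: 0,     // main fork
    };
    // RelBlock: 00 SPCNODE DBNODE RELNODE FORK BLKNUM
    let block_key = rel_block_to_key(rel, 7);
    // RelSize:  00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
    let size_key = rel_size_to_key(rel);
    // The size entry therefore always sorts right after the relation's blocks,
    // which is the adjacency the layout comment above relies on.
    assert!(block_key < size_key);
}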
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {

View File

@@ -18,7 +18,7 @@ use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models::ShardParameters;
use pageserver_api::models;
use pageserver_api::models::TimelineState;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
@@ -74,6 +74,7 @@ use crate::tenant::config::LocationMode;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
@@ -111,7 +112,7 @@ use toml_edit;
use utils::{
crashsafe,
generation::Generation,
id::{TenantId, TimelineId},
id::TimelineId,
lsn::{Lsn, RecordLsn},
};
@@ -370,13 +371,13 @@ impl WalRedoManager {
pub enum GetTimelineError {
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantId,
tenant_id: TenantShardId,
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
NotFound {
tenant_id: TenantId,
tenant_id: TenantShardId,
timeline_id: TimelineId,
},
}
@@ -715,6 +716,10 @@ impl Tenant {
// stayed in Activating for such a long time that shutdown found it in
// that state.
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
// just shutting down), but ensures progress.
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
return Ok(());
},
)
@@ -1516,10 +1521,6 @@ impl Tenant {
.map_err(LoadLocalTimelineError::Load)
}
pub(crate) fn tenant_id(&self) -> TenantId {
self.tenant_shard_id.tenant_id
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
@@ -1535,13 +1536,13 @@ impl Tenant {
let timeline = timelines_accessor
.get(&timeline_id)
.ok_or(GetTimelineError::NotFound {
tenant_id: self.tenant_shard_id.tenant_id,
tenant_id: self.tenant_shard_id,
timeline_id,
})?;
if active_only && !timeline.is_active() {
Err(GetTimelineError::NotActive {
tenant_id: self.tenant_shard_id.tenant_id,
tenant_id: self.tenant_shard_id,
timeline_id,
state: timeline.current_state(),
})
@@ -1880,7 +1881,7 @@ impl Tenant {
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<(), timeline::CompactionError> {
) -> Result<(), timeline::CompactionError> {
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
if !self.is_active() {
return Ok(());
@@ -2325,6 +2326,32 @@ impl Tenant {
.clone()
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.tenant_conf.read().unwrap();
let location_config_mode = match conf.location.attach_mode {
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
};
// We have a pageserver TenantConf, we need the API-facing TenantConfig.
let tenant_config: models::TenantConfig = conf.tenant_conf.into();
models::LocationConfig {
mode: location_config_mode,
generation: self.generation.into(),
secondary_conf: None,
shard_number: self.shard_identity.number.0,
shard_count: self.shard_identity.count.0,
shard_stripe_size: self.shard_identity.stripe_size.0,
tenant_conf: tenant_config,
}
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
@@ -2570,7 +2597,9 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
// Strings for metric labels
let tid = tenant_shard_id.to_string();
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2583,13 +2612,15 @@ impl Tenant {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
drop(
crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
);
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.with_label_values(&[&tid, &shard_id_str])
.set(1);
true
};
@@ -2615,7 +2646,7 @@ impl Tenant {
counted_broken = true;
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.with_label_values(&[&tid, &shard_id_str])
.inc();
}
}
@@ -2679,7 +2710,7 @@ impl Tenant {
Ok(LocationConf::attached_single(
tenant_conf,
Generation::none(),
&ShardParameters::default(),
&models::ShardParameters::default(),
))
} else {
// FIXME If the config file is not found, assume that we're attaching
@@ -3272,6 +3303,18 @@ impl Tenant {
let Some(storage) = &self.remote_storage else {
bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
};
if existing_initdb_timeline_id != timeline_id {
let source_path = &remote_initdb_archive_path(
&self.tenant_shard_id.tenant_id,
&existing_initdb_timeline_id,
);
let dest_path =
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
storage
.copy_object(source_path, dest_path)
.await
.context("copy initdb tar")?;
}
let (initdb_tar_zst_path, initdb_tar_zst) =
self::remote_timeline_client::download_initdb_tar_zst(
self.conf,
@@ -3295,7 +3338,7 @@ impl Tenant {
.await
.context("extract initdb tar")?;
} else {
// Temporarily init a repo to get bootstrap data; this creates a directory at the `initdb_path` path
// Temporarily init a repo to get bootstrap data; this creates a directory at the `pgdata_path` path
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
// Upload the created data dir to S3
@@ -3590,6 +3633,9 @@ impl Tenant {
self.cached_synthetic_tenant_size
.store(size, Ordering::Relaxed);
// Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_zero());
TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
.unwrap()
@@ -3741,7 +3787,7 @@ async fn run_initdb(
impl Drop for Tenant {
fn drop(&mut self) {
remove_tenant_metrics(&self.tenant_shard_id.tenant_id);
remove_tenant_metrics(&self.tenant_shard_id);
}
}
/// Dump contents of a layer file to stdout.
@@ -3780,6 +3826,7 @@ pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
@@ -5168,7 +5215,7 @@ mod tests {
assert_eq!(
e,
GetTimelineError::NotFound {
tenant_id: tenant.tenant_shard_id.tenant_id,
tenant_id: tenant.tenant_shard_id,
timeline_id: TIMELINE_ID,
}
)

View File

@@ -9,7 +9,8 @@
//! may lead to data loss.
//!
use anyhow::bail;
use pageserver_api::models::{self, ShardParameters};
use pageserver_api::models;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -170,7 +171,7 @@ impl LocationConf {
pub(crate) fn attached_single(
tenant_conf: TenantConfOpt,
generation: Generation,
shard_params: &ShardParameters,
shard_params: &models::ShardParameters,
) -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
@@ -431,30 +432,6 @@ pub struct TenantConfOpt {
pub heatmap_period: Option<Duration>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
NoEviction,
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
}
impl EvictionPolicy {
pub fn discriminant_str(&self) -> &'static str {
match self {
EvictionPolicy::NoEviction => "NoEviction",
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
pub period: Duration,
#[serde(with = "humantime_serde")]
pub threshold: Duration,
}
impl TenantConfOpt {
pub fn merge(&self, global_conf: TenantConf) -> TenantConf {
TenantConf {
@@ -579,6 +556,38 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
}
}
/// This is a conversion from our internal tenant config object to the one used
/// in external APIs.
impl From<TenantConfOpt> for models::TenantConfig {
fn from(value: TenantConfOpt) -> Self {
fn humantime(d: Duration) -> String {
format!("{}s", d.as_secs())
}
Self {
checkpoint_distance: value.checkpoint_distance,
checkpoint_timeout: value.checkpoint_timeout.map(humantime),
compaction_target_size: value.compaction_target_size,
compaction_period: value.compaction_period.map(humantime),
compaction_threshold: value.compaction_threshold,
gc_horizon: value.gc_horizon,
gc_period: value.gc_period.map(humantime),
image_creation_threshold: value.image_creation_threshold,
pitr_interval: value.pitr_interval.map(humantime),
walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
max_lsn_wal_lag: value.max_lsn_wal_lag,
trace_read_requests: value.trace_read_requests,
eviction_policy: value.eviction_policy,
min_resident_size_override: value.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: value
.evictions_low_residence_duration_metric_threshold
.map(humantime),
gc_feedback: value.gc_feedback,
heatmap_period: value.heatmap_period.map(humantime),
}
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -619,8 +619,8 @@ impl LayerMap {
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
Ok(self.l0_delta_layers.to_vec())
pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
self.l0_delta_layers.to_vec()
}
/// debugging function to print out the contents of the layer map

View File

@@ -57,6 +57,7 @@ use super::TenantSharedResources;
/// that way we avoid having to carefully switch a tenant's ingestion etc on and off during
/// its lifetime, and we can preserve some important safety invariants like `Tenant` always
/// having a properly acquired generation (Secondary doesn't need a generation)
#[derive(Clone)]
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
@@ -477,6 +478,8 @@ pub async fn init_tenant_mgr(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf,
secondary_config,
)),
);
@@ -844,15 +847,13 @@ impl TenantManager {
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
@@ -921,6 +922,7 @@ impl TenantManager {
Some(TenantSlot::Secondary(secondary_tenant)),
) => {
secondary_tenant.set_config(secondary_conf);
secondary_tenant.set_tenant_conf(&new_location_config.tenant_conf);
Some(FastPathModified::Secondary(secondary_tenant.clone()))
}
_ => {
@@ -1053,16 +1055,36 @@ impl TenantManager {
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(secondary_config) => {
TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
let shard_identity = new_location_config.shard;
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
new_location_config.tenant_conf,
secondary_config,
))
}
LocationMode::Attached(_attach_config) => {
let shard_identity = new_location_config.shard;
// Testing hack: if we are configured with no control plane, then drop the generation
// from upserts. This enables creating generation-less tenants even though neon_local
// always uses generations when calling the location conf API.
let attached_conf = if cfg!(feature = "testing") {
let mut conf = AttachedTenantConf::try_from(new_location_config)?;
if self.conf.control_plane_api.is_none() {
conf.location.generation = Generation::none();
}
conf
} else {
AttachedTenantConf::try_from(new_location_config)?
};
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(new_location_config)?,
attached_conf,
shard_identity,
None,
self.tenants,
@@ -1203,6 +1225,17 @@ impl TenantManager {
}
}
/// Total list of all tenant slots: this includes attached, secondary, and InProgress.
pub(crate) fn list(&self) -> Vec<(TenantShardId, TenantSlot)> {
let locked = self.tenants.read().unwrap();
match &*locked {
TenantsMap::Initializing => Vec::new(),
TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => {
map.iter().map(|(k, v)| (*k, v.clone())).collect()
}
}
}
pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
@@ -1271,10 +1304,13 @@ impl TenantManager {
#[derive(Debug, thiserror::Error)]
pub(crate) enum GetTenantError {
/// NotFound is a TenantId rather than TenantShardId, because this error type is used from
/// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard.
#[error("Tenant {0} not found")]
NotFound(TenantId),
#[error("Tenant {0} is not active")]
NotActive(TenantId),
NotActive(TenantShardId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
@@ -1307,15 +1343,13 @@ pub(crate) fn get_tenant(
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
@@ -1391,7 +1425,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
}
Some(TenantSlot::Secondary(_)) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
tenant_shard_id,
)))
}
Some(TenantSlot::InProgress(barrier)) => {
@@ -1430,7 +1464,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
Some(TenantSlot::Attached(tenant)) => tenant.clone(),
_ => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
tenant_shard_id,
)))
}
}
@@ -1458,7 +1492,7 @@ pub(crate) enum DeleteTimelineError {
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantStateError {
#[error("Tenant {0} is stopping")]
IsStopping(TenantId),
IsStopping(TenantShardId),
#[error(transparent)]
SlotError(#[from] TenantSlotError),
#[error(transparent)]
@@ -2088,7 +2122,7 @@ where
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
// wait for it but return an error right away because these are distinct requests.
slot_guard.revert();
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
return Err(TenantStateError::IsStopping(tenant_shard_id));
}
}
Some(tenant)
@@ -2217,7 +2251,6 @@ pub(crate) async fn immediate_gc(
#[cfg(test)]
mod tests {
use pageserver_api::shard::TenantShardId;
use std::collections::BTreeMap;
use std::sync::Arc;
use tracing::{info_span, Instrument};
@@ -2238,7 +2271,7 @@ mod tests {
// harness loads it to active, which is forced and nothing is running on the tenant
let id = TenantShardId::unsharded(t.tenant_id());
let id = t.tenant_shard_id();
// tenant harness configures the logging and we cannot escape it
let _e = info_span!("testing", tenant_id = %id).entered();

View File

@@ -182,7 +182,7 @@
pub(crate) mod download;
pub mod index;
mod upload;
pub(crate) mod upload;
use anyhow::Context;
use camino::Utf8Path;
@@ -237,7 +237,7 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::upload_queue::{self, SetDeletedFlagProgress};
use super::Generation;
pub(crate) use download::{is_temp_download_file, list_remote_timelines};
@@ -522,8 +522,6 @@ impl RemoteTimelineClient {
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
@@ -566,8 +564,6 @@ impl RemoteTimelineClient {
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
@@ -625,7 +621,9 @@ impl RemoteTimelineClient {
///
/// Like schedule_index_upload_for_metadata_update(), this merely adds
/// the upload to the upload queue and returns quickly.
pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
pub(crate) fn schedule_index_upload_for_file_changes(
self: &Arc<Self>,
) -> Result<(), upload_queue::NotInitialized> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -670,7 +668,7 @@ impl RemoteTimelineClient {
pub(crate) fn schedule_layer_file_upload(
self: &Arc<Self>,
layer: ResidentLayer,
) -> anyhow::Result<()> {
) -> Result<(), upload_queue::NotInitialized> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -691,7 +689,10 @@ impl RemoteTimelineClient {
.insert(layer.layer_desc().filename(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!("scheduled layer file upload {layer}");
info!(
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);
let op = UploadOp::UploadLayer(layer, metadata);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -876,7 +877,7 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
compacted_from: &[Layer],
compacted_to: &[ResidentLayer],
) -> anyhow::Result<()> {
) -> Result<(), upload_queue::NotInitialized> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -1348,8 +1349,6 @@ impl RemoteTimelineClient {
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
@@ -1375,8 +1374,6 @@ impl RemoteTimelineClient {
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),

View File

@@ -18,11 +18,16 @@ use self::{
};
use super::{
config::SecondaryLocationConfig, mgr::TenantManager,
span::debug_assert_current_span_has_tenant_id, storage_layer::LayerFileName,
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
};
use pageserver_api::shard::TenantShardId;
use pageserver_api::{
models,
shard::{ShardIdentity, TenantShardId},
};
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
@@ -84,12 +89,20 @@ pub(crate) struct SecondaryTenant {
pub(crate) gate: Gate,
// Secondary mode does not need the full shard identity or the TenantConfOpt. However,
// storing these enables us to report our full LocationConf, enabling convenient reconciliation
// by the control plane (see [`Self::get_location_conf`])
shard_identity: ShardIdentity,
tenant_conf: std::sync::Mutex<TenantConfOpt>,
detail: std::sync::Mutex<SecondaryDetail>,
}
impl SecondaryTenant {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard_identity: ShardIdentity,
tenant_conf: TenantConfOpt,
config: &SecondaryLocationConfig,
) -> Arc<Self> {
Arc::new(Self {
@@ -101,6 +114,9 @@ impl SecondaryTenant {
cancel: CancellationToken::new(),
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
shard_identity,
tenant_conf: std::sync::Mutex::new(tenant_conf),
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
})
}
@@ -116,6 +132,30 @@ impl SecondaryTenant {
self.detail.lock().unwrap().config = config.clone();
}
pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
*(self.tenant_conf.lock().unwrap()) = *config;
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.detail.lock().unwrap().config.clone();
let conf = models::LocationConfigSecondary { warm: conf.warm };
let tenant_conf = *self.tenant_conf.lock().unwrap();
models::LocationConfig {
mode: models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(conf),
shard_number: self.tenant_shard_id.shard_number.0,
shard_count: self.tenant_shard_id.shard_count.0,
shard_stripe_size: self.shard_identity.stripe_size.0,
tenant_conf: tenant_conf.into(),
}
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}

View File

@@ -290,7 +290,7 @@ impl Layer {
}
/// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
pub(crate) async fn download_and_keep_resident(&self) -> anyhow::Result<ResidentLayer> {
pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
let downloaded = self.0.get_or_maybe_download(true, None).await?;
Ok(ResidentLayer {
@@ -1174,7 +1174,7 @@ pub(crate) enum EvictionError {
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
enum DownloadError {
pub(crate) enum DownloadError {
#[error("timeline has already shutdown")]
TimelineShutdown,
#[error("no remote storage configured")]
@@ -1197,6 +1197,15 @@ enum DownloadError {
PostStatFailed(#[source] std::io::Error),
}
impl DownloadError {
pub fn is_cancelled(&self) -> bool {
match self {
Self::TimelineShutdown | Self::DownloadCancelled => true,
_ => false,
}
}
}
#[derive(Debug, PartialEq)]
pub(crate) enum NeedsDownload {
NotFound,

View File

@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
);
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
log_compaction_error(
&e,
error_run_count,
&wait_duration,
cancel.is_cancelled(),
);
wait_duration
} else {
@@ -210,6 +214,38 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
fn log_compaction_error(
e: &CompactionError,
error_run_count: u32,
sleep_duration: &std::time::Duration,
task_cancelled: bool,
) {
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;
enum LooksLike {
Info,
Error,
}
let decision = match e {
ShuttingDown => None,
_ if task_cancelled => Some(LooksLike::Info),
Other(e) => Some(LooksLike::Error),
};
match decision {
Some(LooksLike::Info) => info!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
),
Some(LooksLike::Error) => error!(
"Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
),
None => {}
}
}
///
/// GC task's main loop
///

View File

@@ -15,9 +15,10 @@ use fail::fail_point;
use itertools::Itertools;
use pageserver_api::{
models::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
TimelineState,
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, TenantShardId},
};
use rand::Rng;
@@ -42,7 +43,6 @@ use std::{
ops::ControlFlow,
};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -65,16 +65,17 @@ use crate::{
use crate::{
disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
};
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{CalculateLogicalSizeError, LsnForTimestamp};
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
use pageserver_api::reltag::{BlockNumber, RelTag};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
use postgres_connection::PgConnectionConfig;
@@ -102,11 +103,14 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, upload_queue::NotInitialized};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{
remote_timeline_client::index::{IndexLayerMetadata, IndexPart},
storage_layer::layer,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -251,6 +255,10 @@ pub struct Timeline {
pub(super) metrics: TimelineMetrics,
// `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code
// in `crate::page_service` writes these metrics.
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
/// Ensures layers aren't frozen by checkpointer between
/// [`Timeline::get_layer_for_write`] and layer reads.
/// Locked automatically by [`TimelineWriter`] and checkpointer.
@@ -386,8 +394,7 @@ pub(crate) enum PageReconstructError {
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
/// The operation was cancelled
#[error("Cancelled")]
#[error("timeline shutting down")]
Cancelled,
/// The ancestor of this is being stopped
@@ -399,6 +406,19 @@ pub(crate) enum PageReconstructError {
WalRedo(anyhow::Error),
}
impl PageReconstructError {
/// Returns true if this error indicates a tenant/timeline shutdown alike situation
pub(crate) fn is_stopping(&self) -> bool {
use PageReconstructError::*;
match self {
Other(_) => false,
AncestorLsnTimeout(_) => false,
Cancelled | AncestorStopping(_) => true,
WalRedo(_) => false,
}
}
}
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
/// Timeline cancellation token was cancelled
@@ -827,8 +847,7 @@ impl Timeline {
// "enough".
let layers = self
.create_image_layers(&partitioning, lsn, false, &image_ctx)
.await
.map_err(anyhow::Error::from)?;
.await?;
if let Some(remote_client) = &self.remote_client {
for layer in layers {
remote_client.schedule_layer_file_upload(layer)?;
@@ -1314,6 +1333,11 @@ impl Timeline {
),
),
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
&timeline_id,
),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
layer_flush_start_tx,
@@ -3190,7 +3214,46 @@ pub(crate) enum CompactionError {
ShuttingDown,
/// Compaction cannot be done right now; page reconstruction and so on.
#[error(transparent)]
Other(#[from] anyhow::Error),
Other(anyhow::Error),
}
impl CompactionError {
fn other<E>(err: E) -> Self
where
E: std::error::Error + Send + Sync + 'static,
{
CompactionError::Other(anyhow::Error::new(err))
}
}
impl From<PageReconstructError> for CompactionError {
fn from(value: PageReconstructError) -> Self {
if value.is_stopping() {
CompactionError::ShuttingDown
} else {
CompactionError::other(value)
}
}
}
impl From<NotInitialized> for CompactionError {
fn from(value: NotInitialized) -> Self {
if value.is_stopping() {
CompactionError::ShuttingDown
} else {
CompactionError::other(value)
}
}
}
impl From<layer::DownloadError> for CompactionError {
fn from(value: layer::DownloadError) -> Self {
if value.is_cancelled() {
CompactionError::ShuttingDown
} else {
CompactionError::other(value)
}
}
}
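A minimal, self-contained sketch of the calling convention these conversions establish, using stand-in types rather than the pageserver's real ones: typed errors that know they mean "shutting down" collapse via `From`, while anyhow-flavoured errors must opt into `Other` explicitly (as the `map_err(CompactionError::Other)` call sites below do):
use anyhow::Context;
#[derive(Debug, thiserror::Error)]
enum StepError {
    #[error("shutting down")]
    ShuttingDown,
    #[error(transparent)]
    Other(anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
#[error("timeline is stopping")]
struct Stopping;
impl From<Stopping> for StepError {
    fn from(_: Stopping) -> Self {
        // Cancellation-like causes become the dedicated variant, so callers
        // can log them at info instead of error.
        StepError::ShuttingDown
    }
}
fn load_key(stopping: bool) -> Result<u64, Stopping> {
    if stopping { Err(Stopping) } else { Ok(42) }
}
fn fsync_dir() -> anyhow::Result<()> {
    Ok(())
}
fn compact_step(stopping: bool) -> Result<(), StepError> {
    // Typed errors flow through `From` via `?` ...
    let _key = load_key(stopping)?;
    // ... while anyhow errors have to be wrapped explicitly.
    fsync_dir()
        .context("fsync timeline dir")
        .map_err(StepError::Other)?;
    Ok(())
}
fn main() {
    assert!(matches!(compact_step(true), Err(StepError::ShuttingDown)));
    assert!(compact_step(false).is_ok());
}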
#[serde_as]
@@ -3323,7 +3386,7 @@ impl Timeline {
stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller
let layers = guard.layer_map();
let level0_deltas = layers.get_level0_deltas()?;
let level0_deltas = layers.get_level0_deltas();
let mut level0_deltas = level0_deltas
.into_iter()
.map(|x| guard.get_from_desc(&x))
@@ -3370,7 +3433,8 @@ impl Timeline {
delta
.download_and_keep_resident()
.await
.context("download layer for failpoint")?,
.context("download layer for failpoint")
.map_err(CompactionError::Other)?,
);
}
tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
@@ -3454,7 +3518,7 @@ impl Timeline {
let mut all_keys = Vec::new();
for l in deltas_to_compact.iter() {
all_keys.extend(l.load_keys(ctx).await?);
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
}
// FIXME: should spawn_blocking the rest of this function
@@ -3474,7 +3538,10 @@ impl Timeline {
// does not make much sense, because the largest holes will correspond to field1/field2 changes.
// But we are mostly interested in eliminating holes which cause generation of excessive image layers.
// That is why it is better to measure the size of a hole as the number of covering image layers.
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
let coverage_size = layers
.image_coverage(&key_range, last_record_lsn)
.map_err(CompactionError::Other)?
.len();
if coverage_size >= min_hole_coverage_size {
heap.push(Hole {
key_range,
@@ -3573,7 +3640,7 @@ impl Timeline {
key, lsn, ref val, ..
} in all_values_iter
{
let value = val.load(ctx).await?;
let value = val.load(ctx).await.map_err(CompactionError::Other)?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
@@ -3630,7 +3697,8 @@ impl Timeline {
.take()
.unwrap()
.finish(prev_key.unwrap().next(), self)
.await?,
.await
.map_err(CompactionError::Other)?,
);
writer = None;
@@ -3660,7 +3728,8 @@ impl Timeline {
lsn_range.clone()
},
)
.await?,
.await
.map_err(CompactionError::Other)?,
);
}
@@ -3671,7 +3740,12 @@ impl Timeline {
});
if !self.shard_identity.is_key_disposable(&key) {
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
writer
.as_mut()
.unwrap()
.put_value(key, lsn, value)
.await
.map_err(CompactionError::Other)?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
@@ -3687,7 +3761,12 @@ impl Timeline {
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
new_layers.push(
writer
.finish(prev_key.unwrap().next(), self)
.await
.map_err(CompactionError::Other)?,
);
}
// Sync layers
@@ -3716,7 +3795,8 @@ impl Timeline {
// minimize latency.
par_fsync::par_fsync_async(&layer_paths)
.await
.context("fsync all new layers")?;
.context("fsync all new layers")
.map_err(CompactionError::Other)?;
let timeline_dir = self
.conf
@@ -3724,7 +3804,8 @@ impl Timeline {
par_fsync::par_fsync_async(&[timeline_dir])
.await
.context("fsync of timeline dir")?;
.context("fsync of timeline dir")
.map_err(CompactionError::Other)?;
}
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();

View File

@@ -20,6 +20,7 @@ use std::{
time::{Duration, SystemTime},
};
use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
@@ -29,10 +30,7 @@ use crate::{
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
tasks::BackgroundLoopKind,
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
},
};

View File

@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
pub(super) deleted_at: SetDeletedFlagProgress,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum NotInitialized {
#[error("queue is in state Uninitialized")]
Uninitialized,
#[error("queue is in state Stopping")]
Stopped,
#[error("queue is shutting down")]
ShuttingDown,
}
impl NotInitialized {
pub(crate) fn is_stopping(&self) -> bool {
use NotInitialized::*;
match self {
Uninitialized => false,
Stopped => true,
ShuttingDown => true,
}
}
}
impl UploadQueue {
pub(crate) fn initialize_empty_remote(
&mut self,
@@ -213,18 +234,20 @@ impl UploadQueue {
Ok(self.initialized_mut().expect("we just set it"))
}
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
pub(crate) fn initialized_mut(
&mut self,
) -> Result<&mut UploadQueueInitialized, NotInitialized> {
use UploadQueue::*;
match self {
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Initialized(x) => {
if !x.shutting_down {
Ok(x)
Uninitialized => Err(NotInitialized::Uninitialized.into()),
Initialized(x) => {
if x.shutting_down {
Err(NotInitialized::ShuttingDown.into())
} else {
anyhow::bail!("queue is shutting down")
Ok(x)
}
}
Stopped(_) => Err(NotInitialized::Stopped.into()),
}
}

View File

@@ -14,6 +14,7 @@ use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC
use crate::tenant::TENANTS_SEGMENT_NAME;
use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
@@ -60,6 +61,7 @@ pub struct VirtualFile {
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
// strings.
tenant_id: String,
shard_id: String,
timeline_id: String,
}
@@ -301,15 +303,24 @@ impl VirtualFile {
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
let tenant_id;
let timeline_id;
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
tenant_id = parts[parts.len() - 4].to_string();
timeline_id = parts[parts.len() - 2].to_string();
} else {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (tenant_id, shard_id, timeline_id) =
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
let tenant_shard_part = parts[parts.len() - 4];
let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
Ok(tenant_shard_id) => (
tenant_shard_id.tenant_id.to_string(),
format!("{}", tenant_shard_id.shard_slug()),
),
Err(_) => {
// Malformed path: this ID is just for observability, so tolerate it
// and pass through
(tenant_shard_part.to_string(), "*".to_string())
}
};
(tenant_id, shard_id, parts[parts.len() - 2].to_string())
} else {
("*".to_string(), "*".to_string(), "*".to_string())
};
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
@@ -333,6 +344,7 @@ impl VirtualFile {
path: path.to_path_buf(),
open_options: reopen_options,
tenant_id,
shard_id,
timeline_id,
};
@@ -574,7 +586,7 @@ impl VirtualFile {
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
.with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
.add(size as i64);
}
result
@@ -586,7 +598,7 @@ impl VirtualFile {
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
.with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
.add(size as i64);
}
result

View File

@@ -33,11 +33,12 @@ use utils::failpoint_support;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::pgdatadir_mapping::{DatadirModification, Version};
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::walrecord::*;
use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -102,7 +103,9 @@ impl WalIngest {
buf.advance(decoded.main_data_offset);
assert!(!self.checkpoint_modified);
if self.checkpoint.update_next_xid(decoded.xl_xid) {
if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID
&& self.checkpoint.update_next_xid(decoded.xl_xid)
{
self.checkpoint_modified = true;
}
@@ -330,8 +333,13 @@ impl WalIngest {
< 0
{
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
self.checkpoint_modified = true;
}
// Write a new checkpoint key-value pair on every checkpoint record, even
// if nothing really changed. Not strictly required, but it seems nice to
// have some trace of the checkpoint records in the layer files at the same
// LSNs.
self.checkpoint_modified = true;
}
}
pg_constants::RM_LOGICALMSG_ID => {
@@ -2201,7 +2209,8 @@ mod tests {
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
let remote_initdb_path =
remote_initdb_archive_path(&tenant.tenant_shard_id().tenant_id, &TIMELINE_ID);
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
std::fs::create_dir_all(initdb_path.parent().unwrap())

View File

@@ -47,11 +47,10 @@ use crate::metrics::{
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
};
use crate::pgdatadir_mapping::key_to_slru_block;
use crate::repository::Key;
use crate::walrecord::NeonWalRecord;
use pageserver_api::key::key_to_rel_block;
use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;

View File

@@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
ListCell *option;
const char *role_name = stmt->role->rolename;
if (RoleIsNeonSuperuser(role_name))
if (RoleIsNeonSuperuser(role_name) && !superuser())
elog(ERROR, "can't ALTER neon_superuser");
foreach(option, stmt->options)

View File

@@ -308,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
elog(LOG, "Failed to punch hole in file: %m");
neon_log(LOG, "Failed to punch hole in file: %m");
#endif
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
lfc_ctl->limit = new_size;
elog(DEBUG1, "set local file cache limit to %d", new_size);
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}
@@ -327,7 +327,7 @@ lfc_init(void)
* shared_preload_libraries.
*/
if (!process_shared_preload_libraries_in_progress)
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomIntVariable("neon.max_file_cache_size",
@@ -643,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
elog(DEBUG2, "Swap file cache page");
neon_log(DEBUG2, "Swap file cache page");
}
else
{
@@ -846,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
neon_log(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
elog(ERROR, "incorrect number of output arguments");
neon_log(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);

View File

@@ -64,10 +64,26 @@ static int max_reconnect_attempts = 60;
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
/*
* The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
* allowing it to be changed using pg_reload_conf(). The control plane can
* update the connection string if the pageserver crashes, is relocated, or
* new shards are added. A copy of the current value of the GUC is kept in
* shared memory, updated by the postmaster, because regular backends don't
* reload the config during query execution, but we might need to re-establish
* the pageserver connection with the new connection string even in the middle
* of a query.
*
* The shared memory copy is protected by a lockless algorithm using two
* atomic counters. The counters allow a backend to quickly check if the value
* has changed since last access, and to detect and retry copying the value if
* the postmaster changes the value concurrently. (Postmaster doesn't have a
* PGPROC entry and therefore cannot use LWLocks.)
*/
typedef struct
{
LWLockId lock;
pg_atomic_uint64 update_counter;
pg_atomic_uint64 begin_update_counter;
pg_atomic_uint64 end_update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
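The begin/end counter discipline described in the comment above is essentially a single-writer seqlock. Below is a minimal, self-contained C11 sketch of the same idea, meant purely as an illustration: atomic_thread_fence() stands in for pg_write_barrier()/pg_read_barrier(), the names and the placeholder connection string are invented for the example, and main() exercises the writer and reader sequentially rather than from separate processes. In a genuinely concurrent run, the overlapping access to the plain char buffer is exactly the torn copy that the retry loop exists to detect and discard.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CONNSTR_MAX 256

typedef struct
{
    atomic_uint_fast64_t begin_update_counter; /* bumped before the copy */
    atomic_uint_fast64_t end_update_counter;   /* bumped after the copy  */
    char                 connstring[CONNSTR_MAX];
} SharedConnstring;

/* Writer side (the postmaster's role): bump "begin", copy the value, bump "end". */
static void
writer_update(SharedConnstring *s, const char *newval)
{
    atomic_fetch_add_explicit(&s->begin_update_counter, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);      /* like pg_write_barrier() */
    strncpy(s->connstring, newval, CONNSTR_MAX - 1);
    s->connstring[CONNSTR_MAX - 1] = '\0';
    atomic_thread_fence(memory_order_release);
    atomic_fetch_add_explicit(&s->end_update_counter, 1, memory_order_relaxed);
}

/*
 * Reader side (a backend's role): copy the value, then verify that no update
 * began or finished while we were copying; otherwise the copy may be a torn
 * mix of old and new bytes, so throw it away and retry.
 */
static void
reader_copy(SharedConnstring *s, char *dst, size_t dstlen)
{
    uint_fast64_t begin;
    uint_fast64_t end;

    do
    {
        begin = atomic_load_explicit(&s->begin_update_counter, memory_order_relaxed);
        end = atomic_load_explicit(&s->end_update_counter, memory_order_relaxed);
        atomic_thread_fence(memory_order_acquire);  /* like pg_read_barrier() */
        strncpy(dst, s->connstring, dstlen - 1);
        dst[dstlen - 1] = '\0';
        atomic_thread_fence(memory_order_acquire);
    } while (begin != end
             || begin != atomic_load_explicit(&s->begin_update_counter, memory_order_relaxed)
             || end != atomic_load_explicit(&s->end_update_counter, memory_order_relaxed));
}

int
main(void)
{
    SharedConnstring shared;
    char local[CONNSTR_MAX];

    atomic_init(&shared.begin_update_counter, 0);
    atomic_init(&shared.end_update_counter, 0);
    shared.connstring[0] = '\0';

    writer_update(&shared, "host=pageserver port=6400");  /* placeholder value */
    reader_copy(&shared, local, sizeof(local));
    printf("reader saw: %s\n", local);
    return 0;
}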
@@ -84,7 +100,7 @@ static bool pageserver_flush(void);
static void pageserver_disconnect(void);
static bool
PagestoreShmemIsValid()
PagestoreShmemIsValid(void)
{
return pagestore_shared && UsedShmemSegAddr;
}
@@ -98,31 +114,58 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
if (!PagestoreShmemIsValid())
/*
* Only postmaster updates the copy in shared memory.
*/
if (!PagestoreShmemIsValid() || IsUnderPostmaster)
return;
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
pg_write_barrier();
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
LWLockRelease(pagestore_shared->lock);
pg_write_barrier();
pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
}
static bool
CheckConnstringUpdated()
CheckConnstringUpdated(void)
{
if (!PagestoreShmemIsValid())
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
}
static void
ReloadConnstring()
ReloadConnstring(void)
{
uint64 begin_update_counter;
uint64 end_update_counter;
if (!PagestoreShmemIsValid())
return;
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
LWLockRelease(pagestore_shared->lock);
/*
* Copy the current setting from shared to local memory. Postmaster can
* update the value concurrently, in which case we would copy a garbled
* mix of the old and new values. We will detect it because the counters
* won't match, and retry. But it's important that we don't do anything
* within the retry-loop that would depend on the string having valid
* contents.
*/
do
{
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
pg_read_barrier();
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pg_read_barrier();
}
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
pagestore_local_counter = end_update_counter;
}
static bool
@@ -137,7 +180,7 @@ pageserver_connect(int elevel)
static TimestampTz last_connect_time = 0;
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
TimestampTz now;
uint64_t us_since_last_connect;
uint64_t us_since_last_connect;
Assert(!connected);
@@ -147,7 +190,7 @@ pageserver_connect(int elevel)
}
now = GetCurrentTimestamp();
us_since_last_connect = now - last_connect_time;
us_since_last_connect = now - last_connect_time;
if (us_since_last_connect < delay_us)
{
pg_usleep(delay_us - us_since_last_connect);
@@ -505,8 +548,8 @@ PagestoreShmemInit(void)
&found);
if (!found)
{
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
AssignPageserverConnstring(page_server_connstring, NULL);
}
LWLockRelease(AddinShmemInitLock);
@@ -531,7 +574,6 @@ pagestore_shmem_request(void)
#endif
RequestAddinShmemSpace(PagestoreShmemSize());
RequestNamedLWLockTranche("neon_libpagestore", 1);
}
static void

View File

@@ -990,7 +990,7 @@ nm_pack_request(NeonRequest *msg)
case T_NeonErrorResponse:
case T_NeonDbSizeResponse:
default:
elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
break;
}
return s;
@@ -1085,7 +1085,7 @@ nm_unpack_response(StringInfo s)
case T_NeonGetPageRequest:
case T_NeonDbSizeRequest:
default:
elog(ERROR, "unexpected neon message tag 0x%02x", tag);
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
break;
}
@@ -1277,7 +1277,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
XLogFlush(recptr);
lsn = recptr;
ereport(SmgrTrace,
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1305,7 +1305,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
if (PageIsNew((Page) buffer))
{
ereport(SmgrTrace,
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1313,7 +1313,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else if (PageIsEmptyHeapPage((Page) buffer))
{
ereport(SmgrTrace,
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1321,7 +1321,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(PANIC,
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1330,7 +1330,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(SmgrTrace,
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1430,7 +1430,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
lsn = nm_adjust_lsn(lsn);
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
}
else
@@ -1445,7 +1445,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
*latest = true;
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
Assert(lsn != InvalidXLogRecPtr);
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
lsn = nm_adjust_lsn(lsn);
@@ -1465,7 +1465,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
#endif
if (lsn > flushlsn)
{
elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
(uint32) (lsn >> 32), (uint32) lsn,
(uint32) (flushlsn >> 32), (uint32) flushlsn);
XLogFlush(lsn);
@@ -1509,7 +1509,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return mdexists(reln, forkNum);
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
@@ -1561,7 +1561,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -1570,7 +1570,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
break;
default:
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
return exists;
@@ -1587,7 +1587,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1598,10 +1598,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
@@ -1696,7 +1696,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1707,7 +1707,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/*
@@ -1745,7 +1745,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -1785,7 +1785,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1796,7 +1796,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (max_cluster_size > 0 &&
@@ -1808,7 +1808,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,
(errcode(ERRCODE_DISK_FULL),
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
max_cluster_size),
errhint("This limit is defined by neon.max_cluster_size GUC")));
}
@@ -1821,7 +1821,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("cannot extend file \"%s\" beyond %u blocks",
errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
@@ -1882,7 +1882,7 @@ neon_open(SMgrRelation reln)
mdopen(reln);
/* no work */
elog(SmgrTrace, "[NEON_SMGR] open noop");
neon_log(SmgrTrace, "open noop");
}
/*
@@ -1919,7 +1919,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
return mdprefetch(reln, forknum, blocknum);
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
@@ -1964,11 +1964,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* not implemented */
elog(SmgrTrace, "[NEON_SMGR] writeback noop");
neon_log(SmgrTrace, "writeback noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2098,7 +2098,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
blkno,
RelFileInfoFmt(rinfo),
forkNum,
@@ -2107,7 +2107,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
((NeonErrorResponse *) resp)->message)));
break;
default:
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
/* buffer was used, clean up for later reuse */
@@ -2131,7 +2131,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -2142,7 +2142,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* Try to read from local file cache */
@@ -2170,7 +2170,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
{
if (!PageIsNew((Page) pageserver_masked))
{
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2180,7 +2180,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
}
else if (PageIsNew((Page) buffer))
{
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2195,7 +2195,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2214,7 +2214,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2294,13 +2294,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_wallog_page(reln, forknum, blocknum, buffer, false);
lsn = PageGetLSN((Page) buffer);
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -2327,7 +2327,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2338,12 +2338,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return mdnblocks(reln, forknum);
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
@@ -2371,7 +2371,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2380,11 +2380,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
default:
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
@@ -2427,7 +2427,7 @@ neon_dbsize(Oid dbNode)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
@@ -2435,10 +2435,10 @@ neon_dbsize(Oid dbNode)
break;
default:
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
db_size);
@@ -2458,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2470,7 +2470,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
@@ -2526,7 +2526,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2538,10 +2538,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2566,17 +2566,17 @@ neon_start_unlogged_build(SMgrRelation reln)
* progress at a time. That's enough for the current usage.
*/
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
elog(ERROR, "unlogged relation build is already in progress");
neon_log(ERROR, "unlogged relation build is already in progress");
Assert(unlogged_build_rel == NULL);
ereport(SmgrTrace,
(errmsg("starting unlogged build of relation %u/%u/%u",
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
switch (reln->smgr_relpersistence)
{
case 0:
elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2589,11 +2589,11 @@ neon_start_unlogged_build(SMgrRelation reln)
return;
default:
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
elog(ERROR, "cannot perform unlogged index build, index is not empty ");
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
@@ -2620,7 +2620,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2649,7 +2649,7 @@ neon_end_unlogged_build(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg("ending unlogged build of relation %u/%u/%u",
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2664,7 +2664,7 @@ neon_end_unlogged_build(SMgrRelation reln)
rinfob = InfoBFromSMgrRel(reln);
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
@@ -2707,7 +2707,7 @@ AtEOXact_neon(XactEvent event, void *arg)
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg("unlogged index build was not properly finished"))));
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
}
break;
}
@@ -2806,14 +2806,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
set_cached_relsize(rinfo, forknum, relsize);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
elog(SmgrTrace, "Set length to %d", relsize);
neon_log(SmgrTrace, "Set length to %d", relsize);
}
}
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
/*
* TODO: May be it is better to make correspondent fgunctio from freespace.c public?
* TODO: May be it is better to make correspondent function from freespace.c public?
*/
static BlockNumber
get_fsm_physical_block(BlockNumber heapblk)
@@ -2894,7 +2894,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
#if PG_VERSION_NUM < 150000
if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
elog(PANIC, "failed to locate backup block with ID %d", block_id);
neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
#else
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
#endif

View File

@@ -40,11 +40,23 @@ typedef struct
{
RelTag tag;
BlockNumber size;
dlist_node lru_node; /* LRU list node */
} RelSizeEntry;
typedef struct
{
size_t size;
uint64 hits;
uint64 misses;
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
} RelSizeHashControl;
static HTAB *relsize_hash;
static LWLockId relsize_lock;
static int relsize_hash_size;
static RelSizeHashControl* relsize_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
@@ -52,7 +64,7 @@ static void relsize_shmem_request(void);
#endif
/*
* Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
* Size of a cache entry is 36 bytes. So this default will take about 2.3 MB,
* which seems reasonable.
*/
#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
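For reference, the arithmetic behind "about 2.3 MB": taking the 36-byte entry size stated in the comment, 64 × 1024 entries × 36 bytes = 2,359,296 bytes ≈ 2.25 MiB, ignoring dynahash bookkeeping overhead.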
@@ -61,19 +73,29 @@ static void
neon_smgr_shmem_startup(void)
{
static HASHCTL info;
bool found;
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
info.keysize = sizeof(RelTag);
info.entrysize = sizeof(RelSizeEntry);
relsize_hash = ShmemInitHash("neon_relsize",
relsize_hash_size, relsize_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found);
if (!found)
{
relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
info.keysize = sizeof(RelTag);
info.entrysize = sizeof(RelSizeEntry);
relsize_hash = ShmemInitHash("neon_relsize",
relsize_hash_size, relsize_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
relsize_ctl->size = 0;
relsize_ctl->hits = 0;
relsize_ctl->misses = 0;
relsize_ctl->writes = 0;
dlist_init(&relsize_ctl->lru);
}
}
bool
@@ -93,7 +115,15 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
if (entry != NULL)
{
*size = entry->size;
relsize_ctl->hits += 1;
found = true;
/* Move entry to the LRU list tail */
dlist_delete(&entry->lru_node);
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
}
else
{
relsize_ctl->misses += 1;
}
LWLockRelease(relsize_lock);
}
@@ -107,12 +137,43 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
RelTag tag;
RelSizeEntry *entry;
bool found = false;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
/*
* This should actually never happen! Below we check if the hash is full and delete the least recently used item in that case.
* But for extra safety we also perform the check here.
*/
while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL)
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
Assert(relsize_ctl->size > 0);
relsize_ctl->size -= 1;
}
entry->size = size;
if (!found)
{
if (++relsize_ctl->size == relsize_hash_size)
{
/*
* Remove the least recently used element from the hash.
* The hash size after this becomes `relsize_hash_size-1`.
* This is not considered a problem, because the hash is expected to be large enough that +-1 doesn't matter.
*/
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
relsize_ctl->size -= 1;
}
}
else
{
dlist_delete(&entry->lru_node);
}
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
relsize_ctl->writes += 1;
LWLockRelease(relsize_lock);
}
}
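A concrete consequence of the eviction path above, assuming the default relsize_hash_size of 64 × 1024: each insertion of a new key momentarily pushes the count to 65,536 inside the lock, immediately evicts the LRU head, and leaves 65,535 resident entries — the "-1" that the comment treats as immaterial at this size.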
@@ -132,6 +193,21 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
if (!found || entry->size < size)
entry->size = size;
if (!found)
{
if (++relsize_ctl->size == relsize_hash_size)
{
RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
relsize_ctl->size -= 1;
}
}
else
{
dlist_delete(&entry->lru_node);
}
relsize_ctl->writes += 1;
dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
LWLockRelease(relsize_lock);
}
}
@@ -142,11 +218,16 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
tag.rinfo = rinfo;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
entry = hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
if (entry)
{
dlist_delete(&entry->lru_node);
relsize_ctl->size -= 1;
}
LWLockRelease(relsize_lock);
}
}
@@ -191,7 +272,7 @@ relsize_shmem_request(void)
if (prev_shmem_request_hook)
prev_shmem_request_hook();
RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestNamedLWLockTranche("neon_relsize", 1);
}
#endif

View File

@@ -959,8 +959,8 @@ DetermineEpochStartLsn(WalProposer *wp)
}
/*
* If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing
* was committed yet. Start streaming then from the basebackup LSN.
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere: we are bootstrapping
* and nothing has been committed yet. In that case, start streaming from the basebackup LSN.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
@@ -973,12 +973,13 @@ DetermineEpochStartLsn(WalProposer *wp)
}
/*
* If propEpochStartLsn is not 0, at least one msg with WAL was sent to
* some connected safekeeper; it must have carried truncateLsn pointing to
* the first record.
* Safekeepers set truncateLsn after timelineStartLsn is known, so it
* should never be zero at this point if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->propEpochStartLsn));
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
/*
* We will be generating WAL since propEpochStartLsn, so we should set

View File

@@ -5,7 +5,7 @@ edition.workspace = true
license.workspace = true
[features]
default = ["testing"]
default = []
testing = []
[dependencies]
@@ -89,3 +89,4 @@ camino-tempfile.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
walkdir.workspace = true

View File

@@ -10,6 +10,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::cache::Cached;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::ConsoleBackend;
use crate::console::AuthSecret;
use crate::context::RequestMonitoring;
use crate::proxy::connect_compute::handle_try_wake;
@@ -43,11 +44,8 @@ use tracing::{error, info, warn};
/// this helps us provide the credentials only to those auth
/// backends which require them for the authentication process.
pub enum BackendType<'a, T> {
/// Current Cloud API (V2).
Console(Cow<'a, console::provider::neon::Api>, T),
/// Local mock of Cloud API (V2).
#[cfg(feature = "testing")]
Postgres(Cow<'a, console::provider::mock::Api>, T),
/// Cloud API (V2).
Console(Cow<'a, ConsoleBackend>, T),
/// Authentication via a web browser.
Link(Cow<'a, url::ApiUrl>),
#[cfg(test)]
@@ -64,9 +62,15 @@ impl std::fmt::Display for BackendType<'_, ()> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use BackendType::*;
match self {
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
#[cfg(feature = "testing")]
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
Console(api, _) => match &**api {
ConsoleBackend::Console(endpoint) => {
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
}
#[cfg(feature = "testing")]
ConsoleBackend::Postgres(endpoint) => {
fmt.debug_tuple("Postgres").field(&endpoint.url()).finish()
}
},
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
#[cfg(test)]
Test(_) => fmt.debug_tuple("Test").finish(),
@@ -81,8 +85,6 @@ impl<T> BackendType<'_, T> {
use BackendType::*;
match self {
Console(c, x) => Console(Cow::Borrowed(c), x),
#[cfg(feature = "testing")]
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
Link(c) => Link(Cow::Borrowed(c)),
#[cfg(test)]
Test(x) => Test(*x),
@@ -98,8 +100,6 @@ impl<'a, T> BackendType<'a, T> {
use BackendType::*;
match self {
Console(c, x) => Console(c, f(x)),
#[cfg(feature = "testing")]
Postgres(c, x) => Postgres(c, f(x)),
Link(c) => Link(c),
#[cfg(test)]
Test(x) => Test(x),
@@ -114,8 +114,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
use BackendType::*;
match self {
Console(c, x) => x.map(|x| Console(c, x)),
#[cfg(feature = "testing")]
Postgres(c, x) => x.map(|x| Postgres(c, x)),
Link(c) => Ok(Link(c)),
#[cfg(test)]
Test(x) => Ok(Test(x)),
@@ -325,8 +323,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
match self {
Console(_, user_info) => user_info.project.clone(),
#[cfg(feature = "testing")]
Postgres(_, user_info) => user_info.project.clone(),
Link(_) => Some("link".into()),
#[cfg(test)]
Test(_) => Some("test".into()),
@@ -339,8 +335,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
match self {
Console(_, user_info) => &user_info.user,
#[cfg(feature = "testing")]
Postgres(_, user_info) => &user_info.user,
Link(_) => "link",
#[cfg(test)]
Test(_) => "test",
@@ -371,19 +365,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
.await?;
(cache_info, BackendType::Console(api, user_info))
}
#[cfg(feature = "testing")]
Postgres(api, user_info) => {
info!(
user = &*user_info.user,
project = user_info.project(),
"performing authentication using a local postgres instance"
);
let (cache_info, user_info) =
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
.await?;
(cache_info, BackendType::Postgres(api, user_info))
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
info!("performing link authentication");
@@ -414,8 +395,6 @@ impl BackendType<'_, ComputeUserInfo> {
use BackendType::*;
match self {
Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
#[cfg(feature = "testing")]
Postgres(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
#[cfg(test)]
Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))),
@@ -432,8 +411,6 @@ impl BackendType<'_, ComputeUserInfo> {
match self {
Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
#[cfg(feature = "testing")]
Postgres(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
Link(_) => Ok(None),
#[cfg(test)]
Test(x) => x.wake_compute().map(Some),

View File

@@ -57,24 +57,31 @@ pub(super) async fn authenticate(
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
let psql_session_id = new_psql_session_id();
// registering waiter can fail if we get unlucky with rng.
// just try again.
let (psql_session_id, waiter) = loop {
let psql_session_id = new_psql_session_id();
match console::mgmt::get_waiter(&psql_session_id) {
Ok(waiter) => break (psql_session_id, waiter),
Err(_e) => continue,
}
};
let span = info_span!("link", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
let db_info = console::mgmt::with_waiter(psql_session_id, |waiter| async {
// Give user a URL to spawn a new database.
info!(parent: &span, "sending the auth URL to the user");
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&Be::CLIENT_ENCODING)?
.write_message(&Be::NoticeResponse(&greeting))
.await?;
// Give user a URL to spawn a new database.
info!(parent: &span, "sending the auth URL to the user");
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&Be::CLIENT_ENCODING)?
.write_message(&Be::NoticeResponse(&greeting))
.await?;
// Wait for web console response (see `mgmt`).
info!(parent: &span, "waiting for console's reply...");
waiter.await?.map_err(LinkAuthError::AuthFailed)
})
.await?;
// Wait for web console response (see `mgmt`).
info!(parent: &span, "waiting for console's reply...");
let db_info = waiter.await.map_err(LinkAuthError::from)?;
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;

View File

@@ -249,12 +249,19 @@ async fn main() -> anyhow::Result<()> {
}
if let auth::BackendType::Console(api, _) = &config.auth_backend {
let cache = api.caches.project_info.clone();
if let Some(url) = args.redis_notifications {
info!("Starting redis notifications listener ({url})");
maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
match &**api {
proxy::console::provider::ConsoleBackend::Console(api) => {
let cache = api.caches.project_info.clone();
if let Some(url) = args.redis_notifications {
info!("Starting redis notifications listener ({url})");
maintenance_tasks
.spawn(notifications::task_main(url.to_owned(), cache.clone()));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
#[cfg(feature = "testing")]
proxy::console::provider::ConsoleBackend::Postgres(_) => {}
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
let maintenance = loop {
@@ -351,13 +358,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
let api = console::provider::neon::Api::new(endpoint, caches, locks);
let api = console::provider::ConsoleBackend::Console(api);
auth::BackendType::Console(Cow::Owned(api), ())
}
#[cfg(feature = "testing")]
AuthBackend::Postgres => {
let url = args.auth_endpoint.parse()?;
let api = console::provider::mock::Api::new(url);
auth::BackendType::Postgres(Cow::Owned(api), ())
let api = console::provider::ConsoleBackend::Postgres(api);
auth::BackendType::Console(Cow::Owned(api), ())
}
AuthBackend::Link => {
let url = args.uri.parse()?;

View File

@@ -266,7 +266,7 @@ impl ProjectInfoCacheImpl {
tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32);
loop {
interval.tick().await;
if self.cache.len() <= self.config.size {
if self.cache.len() < self.config.size {
// If there are not too many entries, wait until the next gc cycle.
continue;
}

View File

@@ -13,16 +13,10 @@ use tracing::{error, info, info_span, Instrument};
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
/// Give caller an opportunity to wait for the cloud's reply.
pub async fn with_waiter<R, T, E>(
pub fn get_waiter(
psql_session_id: impl Into<String>,
action: impl FnOnce(Waiter<'static, ComputeReady>) -> R,
) -> Result<T, E>
where
R: std::future::Future<Output = Result<T, E>>,
E: From<waiters::RegisterError>,
{
let waiter = CPLANE_WAITERS.register(psql_session_id.into())?;
action(waiter).await
) -> Result<Waiter<'static, ComputeReady>, waiters::RegisterError> {
CPLANE_WAITERS.register(psql_session_id.into())
}
pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
@@ -77,7 +71,7 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
}
/// A message received by `mgmt` when a compute node is ready.
pub type ComputeReady = Result<DatabaseInfo, String>;
pub type ComputeReady = DatabaseInfo;
// TODO: replace with an http-based protocol.
struct MgmtHandler;
@@ -102,7 +96,7 @@ fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), Qu
let _enter = span.enter();
info!("got response: {:?}", resp.result);
match notify(resp.session_id, Ok(resp.result)) {
match notify(resp.session_id, resp.result) {
Ok(()) => {
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?

View File

@@ -248,23 +248,75 @@ pub trait Api {
async fn get_role_secret(
&self,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
user_info: &ComputeUserInfo,
) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError>;
async fn get_allowed_ips(
&self,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
user_info: &ComputeUserInfo,
) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
#[derive(Clone)]
pub enum ConsoleBackend {
/// Current Cloud API (V2).
Console(neon::Api),
/// Local mock of Cloud API (V2).
#[cfg(feature = "testing")]
Postgres(mock::Api),
}
#[async_trait]
impl Api for ConsoleBackend {
async fn get_role_secret(
&self,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError> {
use ConsoleBackend::*;
match self {
Console(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(feature = "testing")]
Postgres(api) => api.get_role_secret(ctx, user_info).await,
}
}
async fn get_allowed_ips(
&self,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedAllowedIps, errors::GetAuthInfoError> {
use ConsoleBackend::*;
match self {
Console(api) => api.get_allowed_ips(ctx, user_info).await,
#[cfg(feature = "testing")]
Postgres(api) => api.get_allowed_ips(ctx, user_info).await,
}
}
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
use ConsoleBackend::*;
match self {
Console(api) => api.wake_compute(ctx, user_info).await,
#[cfg(feature = "testing")]
Postgres(api) => api.wake_compute(ctx, user_info).await,
}
}
}
/// Various caches for [`console`](super).
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.

View File

@@ -179,17 +179,18 @@ impl super::Api for Api {
return Ok(Some(role_secret));
}
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
let project_id = auth_info.project_id.unwrap_or(ep.clone());
if let Some(secret) = &auth_info.secret {
self.caches
.project_info
.insert_role_secret(&project_id, ep, user, secret.clone())
if let Some(project_id) = auth_info.project_id {
if let Some(secret) = &auth_info.secret {
self.caches
.project_info
.insert_role_secret(&project_id, ep, user, secret.clone())
}
self.caches.project_info.insert_allowed_ips(
&project_id,
ep,
Arc::new(auth_info.allowed_ips),
);
}
self.caches.project_info.insert_allowed_ips(
&project_id,
ep,
Arc::new(auth_info.allowed_ips),
);
// When we just got a secret, we don't need to invalidate it.
Ok(auth_info.secret.map(Cached::new_uncached))
}
@@ -212,15 +213,16 @@ impl super::Api for Api {
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user;
let project_id = auth_info.project_id.unwrap_or(ep.clone());
if let Some(secret) = &auth_info.secret {
if let Some(project_id) = auth_info.project_id {
if let Some(secret) = &auth_info.secret {
self.caches
.project_info
.insert_role_secret(&project_id, ep, user, secret.clone())
}
self.caches
.project_info
.insert_role_secret(&project_id, ep, user, secret.clone())
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
}
self.caches
.project_info
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
Ok(Cached::new_uncached(allowed_ips))
}

View File

@@ -32,6 +32,7 @@ pub struct RequestMonitoring {
user: Option<SmolStr>,
application: Option<SmolStr>,
error_kind: Option<ErrorKind>,
success: bool,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -59,6 +60,7 @@ impl RequestMonitoring {
user: None,
application: None,
error_kind: None,
success: false,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
@@ -96,6 +98,10 @@ impl RequestMonitoring {
self.user = Some(user);
}
pub fn set_success(&mut self) {
self.success = true;
}
pub fn log(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());

View File

@@ -1,7 +1,8 @@
use std::sync::Arc;
use std::{sync::Arc, time::SystemTime};
use anyhow::Context;
use bytes::BytesMut;
use chrono::{Datelike, Timelike};
use futures::{Stream, StreamExt};
use parquet::{
basic::Compression,
@@ -86,6 +87,12 @@ struct RequestData {
project: Option<String>,
branch: Option<String>,
error: Option<&'static str>,
/// Success is counted if we form an HTTP response with sql rows inside,
/// or if we make it to proxy_pass.
success: bool,
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// through to success/failure.
duration_us: u64,
}
impl From<RequestMonitoring> for RequestData {
@@ -102,6 +109,11 @@ impl From<RequestMonitoring> for RequestData {
protocol: value.protocol,
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_str()),
success: value.success,
duration_us: SystemTime::from(value.first_packet)
.elapsed()
.unwrap_or_default()
.as_micros() as u64, // 584 millennia... good enough
}
}
}
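Checking the arithmetic behind that comment: a u64 of microseconds wraps at 2^64 µs ≈ 1.84 × 10^19 µs ≈ 1.84 × 10^13 s ≈ 5.8 × 10^5 years, i.e. roughly 584 millennia, so truncating the u128 returned by as_micros() cannot lose any plausible session duration.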
@@ -266,7 +278,13 @@ async fn upload_parquet(
let compression = len as f64 / len_uncompressed as f64;
let size = data.len();
let id = uuid::Uuid::now_v7();
let now = chrono::Utc::now();
let id = uuid::Uuid::new_v7(uuid::Timestamp::from_unix(
uuid::NoContext,
// we won't be running this in 1970. this cast is ok
now.timestamp() as u64,
now.timestamp_subsec_nanos(),
));
info!(
%id,
@@ -274,7 +292,14 @@ async fn upload_parquet(
size, compression, "uploading request parquet file"
);
let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?;
let year = now.year();
let month = now.month();
let day = now.day();
let hour = now.hour();
// segment files by time for S3 performance
let path = RemotePath::from_string(&format!(
"{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
))?;
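To make the key layout concrete: a batch flushed at, say, 2024-01-23 19:05 UTC would land at 2024/01/23/19/requests_<uuid>.parquet, so consumers can list a single hour's prefix instead of scanning the whole bucket. The standalone C sketch below (illustration only; the proxy itself does this in Rust with chrono, and derives the UUIDv7 from the same timestamp so the file id sorts with the prefix) prints the prefix for the current hour, with "<uuid>" standing in for the real id.

#include <stdio.h>
#include <time.h>

int
main(void)
{
    time_t     now = time(NULL);
    struct tm *utc = gmtime(&now);   /* hour-granularity UTC timestamp */
    char       key[128];

    /* e.g. "2024/01/23/19/requests_<uuid>.parquet" for a flush at 19:xx UTC */
    snprintf(key, sizeof(key), "%04d/%02d/%02d/%02d/requests_%s.parquet",
             utc->tm_year + 1900, utc->tm_mon + 1, utc->tm_mday, utc->tm_hour,
             "<uuid>");
    puts(key);
    return 0;
}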
backoff::retry(
|| async {
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
@@ -332,6 +357,7 @@ mod tests {
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use tokio::{sync::mpsc, time};
use walkdir::WalkDir;
use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData};
@@ -420,6 +446,8 @@ mod tests {
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
error: None,
success: rng.gen(),
duration_us: rng.gen_range(0..30_000_000),
}
}
@@ -442,9 +470,11 @@ mod tests {
worker_inner(storage, rx, config).await.unwrap();
let mut files = std::fs::read_dir(tmpdir.as_std_path())
.unwrap()
.map(|entry| entry.unwrap().path())
let mut files = WalkDir::new(tmpdir.as_std_path())
.into_iter()
.filter_map(|entry| entry.ok())
.filter(|entry| entry.file_type().is_file())
.map(|entry| entry.path().to_path_buf())
.collect_vec();
files.sort();
@@ -485,15 +515,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
(1087635, 3, 6000),
(1087288, 3, 6000),
(1087444, 3, 6000),
(1087572, 3, 6000),
(1087468, 3, 6000),
(1087500, 3, 6000),
(1087533, 3, 6000),
(1087566, 3, 6000),
(362671, 1, 2000)
],
);
@@ -523,11 +553,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1166201, 6, 12000),
(1163577, 6, 12000),
(1164641, 6, 12000),
(1168772, 6, 12000),
(196761, 1, 2000)
(1028637, 5, 10000),
(1031969, 5, 10000),
(1019900, 5, 10000),
(1020365, 5, 10000),
(1025010, 5, 10000)
],
);
@@ -559,11 +589,11 @@ mod tests {
assert_eq!(
file_stats,
[
(1144934, 6, 12000),
(1144941, 6, 12000),
(1144735, 6, 12000),
(1144936, 6, 12000),
(191035, 1, 2000)
(1210770, 6, 12000),
(1211036, 6, 12000),
(1210990, 6, 12000),
(1210861, 6, 12000),
(202073, 1, 2000)
],
);
@@ -588,15 +618,15 @@ mod tests {
assert_eq!(
file_stats,
[
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
(1087635, 3, 6000),
(1087288, 3, 6000),
(1087444, 3, 6000),
(1087572, 3, 6000),
(1087468, 3, 6000),
(1087500, 3, 6000),
(1087533, 3, 6000),
(1087566, 3, 6000),
(362671, 1, 2000)
],
);
@@ -633,7 +663,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)],
[(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)],
);
tmpdir.close().unwrap();

View File

@@ -356,6 +356,7 @@ pub async fn proxy_pass(
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {

View File

@@ -160,8 +160,6 @@ where
let node_info = loop {
let wake_res = match user_info {
auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await,
#[cfg(feature = "testing")]
auth::BackendType::Postgres(api, user_info) => api.wake_compute(ctx, user_info).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend

View File

@@ -46,14 +46,11 @@ enum Notification {
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct AllowedIpsUpdate {
#[serde(rename = "project")]
project_id: SmolStr,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct PasswordUpdate {
#[serde(rename = "project")]
project_id: SmolStr,
#[serde(rename = "role")]
role_name: SmolStr,
}
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
@@ -151,7 +148,7 @@ mod tests {
#[test]
fn parse_allowed_ips() -> anyhow::Result<()> {
let project_id = "new_project".to_string();
let data = format!("{{\"project\": \"{project_id}\"}}");
let data = format!("{{\"project_id\": \"{project_id}\"}}");
let text = json!({
"type": "message",
"topic": "/allowed_ips_updated",
@@ -177,7 +174,7 @@ mod tests {
fn parse_password_updated() -> anyhow::Result<()> {
let project_id = "new_project".to_string();
let role_name = "new_role".to_string();
let data = format!("{{\"project\": \"{project_id}\", \"role\": \"{role_name}\"}}");
let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
let text = json!({
"type": "message",
"topic": "/password_updated",

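The two tests above encode the new wire format: with the serde renames dropped, the JSON keys in the notification payload now match the Rust field names directly. A minimal sketch of the inner payloads the consumer now expects (the values are made up, and only the `data` content shown in the tests is reproduced here, not the full message envelope):

import json

# Inner payloads carried by the Redis notifications; keys mirror the struct fields.
allowed_ips_updated = json.dumps({"project_id": "new_project"})
password_updated = json.dumps({"project_id": "new_project", "role_name": "new_role"})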
View File

@@ -26,7 +26,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console,
console::{self, messages::MetricsAuxInfo},
context::RequestMonitoring,
metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::connect_compute::ConnectMechanism,
@@ -362,6 +362,7 @@ impl GlobalConnPool {
// ok return cached connection if found and establish a new one otherwise
let new_client = if let Some(client) = client {
ctx.set_project(client.aux.clone());
if client.inner.is_closed() {
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
@@ -593,10 +594,6 @@ async fn connect_to_compute_once(
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
});
let ids = Ids {
endpoint_id: node_info.aux.endpoint_id.clone(),
branch_id: node_info.aux.branch_id.clone(),
};
let db_user = conn_info.db_and_user();
tokio::spawn(
@@ -664,7 +661,7 @@ async fn connect_to_compute_once(
Ok(ClientInner {
inner: client,
session: tx,
ids,
aux: node_info.aux.clone(),
conn_id,
})
}
@@ -672,13 +669,17 @@ async fn connect_to_compute_once(
struct ClientInner {
inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender<uuid::Uuid>,
ids: Ids,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
}
impl Client {
pub fn metrics(&self) -> Arc<MetricCounter> {
USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone())
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
})
}
}

View File

@@ -497,6 +497,7 @@ async fn handle_inner(
}
};
ctx.set_success();
ctx.log();
let metrics = client.metrics();

View File

@@ -288,34 +288,32 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
}
/// Deactivates the timeline and removes its data directory.
async fn timeline_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "timeline_id")?,
);
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
let resp = GlobalTimelines::delete_force(&ttid)
let resp = GlobalTimelines::delete(&ttid, only_local)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
}
/// Deactivates all timelines for the tenant and removes its data directory.
/// See `timeline_delete_force_handler`.
async fn tenant_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
/// See `timeline_delete_handler`.
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id = parse_request_param(&request, "tenant_id")?;
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
.await
.map_err(ApiError::InternalServerError)?;
json_response(
@@ -512,10 +510,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
request_span(r, timeline_status_handler)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_delete_force_handler)
request_span(r, timeline_delete_handler)
})
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_force_handler)
request_span(r, tenant_delete_handler)
})
.post("/v1/pull_timeline", |r| {
request_span(r, timeline_pull_handler)
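The renamed handlers keep their routes but now accept an optional `only_local` query parameter (defaulting to false). A hedged sketch of calling them directly over HTTP; the address, IDs, and the assumption that auth is disabled are placeholders for illustration, not values taken from this diff:

import requests

safekeeper_http = "http://localhost:7676"  # placeholder safekeeper HTTP address
tenant_id = "0123456789abcdef0123456789abcdef"  # placeholder IDs
timeline_id = "fedcba9876543210fedcba9876543210"

# Delete one timeline; with only_local=false the WAL in remote storage is removed too.
requests.delete(
    f"{safekeeper_http}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
    params={"only_local": "false"},
).raise_for_status()

# Delete every timeline of the tenant, but keep the WAL in remote storage.
requests.delete(
    f"{safekeeper_http}/v1/tenant/{tenant_id}",
    params={"only_local": "true"},
).raise_for_status()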

View File

@@ -88,6 +88,10 @@ impl SafeKeeperConf {
self.tenant_dir(&ttid.tenant_id)
.join(ttid.timeline_id.to_string())
}
pub fn is_wal_backup_enabled(&self) -> bool {
self.remote_storage.is_some() && self.wal_backup_enabled
}
}
impl SafeKeeperConf {

View File

@@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_backed_up_segments_total",
"Number of WAL segments backed up to the broker"
"Number of WAL segments backed up to the S3"
)
.expect("Failed to register safekeeper_backed_up_segments_total counter")
});
@@ -337,6 +337,7 @@ pub struct TimelineCollector {
flushed_wal_seconds: GaugeVec,
collect_timeline_metrics: Gauge,
timelines_count: IntGauge,
active_timelines_count: IntGauge,
}
impl Default for TimelineCollector {
@@ -520,6 +521,13 @@ impl TimelineCollector {
.unwrap();
descs.extend(timelines_count.desc().into_iter().cloned());
let active_timelines_count = IntGauge::new(
"safekeeper_active_timelines",
"Total number of active timelines",
)
.unwrap();
descs.extend(active_timelines_count.desc().into_iter().cloned());
TimelineCollector {
descs,
commit_lsn,
@@ -540,6 +548,7 @@ impl TimelineCollector {
flushed_wal_seconds,
collect_timeline_metrics,
timelines_count,
active_timelines_count,
}
}
}
@@ -572,6 +581,7 @@ impl Collector for TimelineCollector {
let timelines = GlobalTimelines::get_all();
let timelines_count = timelines.len();
let mut active_timelines_count = 0;
// Prometheus Collector is sync, and data is stored under async lock. To
// bridge the gap with a crutch, collect data in spawned thread with
@@ -590,6 +600,10 @@ impl Collector for TimelineCollector {
let timeline_id = tli.ttid.timeline_id.to_string();
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
if tli.timeline_is_active {
active_timelines_count += 1;
}
self.commit_lsn
.with_label_values(labels)
.set(tli.mem_state.commit_lsn.into());
@@ -681,6 +695,8 @@ impl Collector for TimelineCollector {
// report total number of timelines
self.timelines_count.set(timelines_count as i64);
self.active_timelines_count
.set(active_timelines_count as i64);
mfs.extend(self.timelines_count.collect());
mfs
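The collector now exports a `safekeeper_active_timelines` gauge alongside the existing totals. A small sketch of reading it from a test, assuming the safekeeper serves Prometheus text at `/metrics` on its HTTP port (that path and the helper below are assumptions, not something this diff shows):

import requests

def safekeeper_active_timelines(http_port: int) -> float:
    # Fetch the Prometheus exposition text and pull out the gauge value.
    text = requests.get(f"http://localhost:{http_port}/metrics").text
    line = next(l for l in text.splitlines() if l.startswith("safekeeper_active_timelines "))
    return float(line.split()[-1])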

View File

@@ -10,11 +10,15 @@ use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let now = tokio::time::Instant::now();
let mut active_timelines = 0;
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
if !tli.is_active().await {
continue;
}
active_timelines += 1;
let ttid = tli.ttid;
async {
if let Err(e) = tli.maybe_persist_control_file().await {
@@ -27,6 +31,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
.instrument(info_span!("WAL removal", ttid = %ttid))
.await;
}
let elapsed = now.elapsed();
let total_timelines = tlis.len();
if elapsed > wal_removal_interval {
info!(
"WAL removal iteration took too long: processed {} active timelines ({} total) in {:?}",
active_timelines, total_timelines, elapsed
);
}
sleep(wal_removal_interval).await;
}
}

View File

@@ -742,6 +742,11 @@ where
state.timeline_start_lsn
);
}
if state.peer_horizon_lsn == Lsn(0) {
// Update peer_horizon_lsn as soon as we know where timeline starts.
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
state.peer_horizon_lsn = msg.timeline_start_lsn;
}
if state.local_start_lsn == Lsn(0) {
state.local_start_lsn = msg.start_streaming_at;
info!("setting local_start_lsn to {:?}", state.local_start_lsn);

View File

@@ -407,7 +407,7 @@ impl SafekeeperPostgresHandler {
self.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,
self.conf.wal_backup_enabled,
self.conf.is_wal_backup_enabled(),
)?;
// Split to concurrently receive and send data; replies are generally

View File

@@ -33,12 +33,13 @@ use crate::safekeeper::{
};
use crate::send_wal::WalSenders;
use crate::state::{TimelineMemState, TimelinePersistentState};
use crate::wal_backup::{self};
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage::Storage as wal_storage_iface;
use crate::SafeKeeperConf;
use crate::{debug_dump, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -471,14 +472,29 @@ impl Timeline {
}
}
/// Delete timeline from disk completely, by removing timeline directory. Background
/// timeline activities will stop eventually.
pub async fn delete_from_disk(
/// Delete timeline from disk completely, by removing timeline directory.
/// Background timeline activities will stop eventually.
///
/// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but
/// deletion API endpoint is retriable.
pub async fn delete(
&self,
shared_state: &mut MutexGuard<'_, SharedState>,
only_local: bool,
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel(shared_state);
// TODO: It's better to wait for s3 offloader termination before
// removing data from s3. Though since s3 doesn't have transactions it
// still wouldn't guarantee absence of data after removal.
let conf = GlobalTimelines::get_global_config();
if !only_local && conf.is_wal_backup_enabled() {
// Note: we concurrently delete remote storage data from multiple
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
// do some retries anyway.
wal_backup::delete_timeline(&self.ttid).await?;
}
let dir_existed = delete_dir(&self.timeline_dir).await?;
Ok((dir_existed, was_active))
}

View File

@@ -327,16 +327,20 @@ impl GlobalTimelines {
}
/// Cancels timeline, then deletes the corresponding data directory.
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete(
ttid: &TenantTimelineId,
only_local: bool,
) -> Result<TimelineDeleteForceResult> {
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
match tli_res {
Ok(timeline) => {
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;
info!("deleting timeline {}", ttid);
info!("deleting timeline {}, only_local={}", ttid, only_local);
let (dir_existed, was_active) =
timeline.delete_from_disk(&mut shared_state).await?;
timeline.delete(&mut shared_state, only_local).await?;
// Remove timeline from the map.
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -369,8 +373,11 @@ impl GlobalTimelines {
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return error and the caller should
/// retry tenant deletion again later.
///
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete_force_all_for_tenant(
tenant_id: &TenantId,
only_local: bool,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
let to_delete = Self::get_all_for_tenant(*tenant_id);
@@ -379,7 +386,7 @@ impl GlobalTimelines {
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete_force(&tli.ttid).await {
match Self::delete(&tli.ttid, only_local).await {
Ok(result) => {
deleted.insert(tli.ttid, result);
}

View File

@@ -4,6 +4,8 @@ use camino::{Utf8Path, Utf8PathBuf};
use futures::stream::FuturesOrdered;
use futures::StreamExt;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::NodeId;
use std::cmp::min;
@@ -166,6 +168,17 @@ async fn update_task(
}
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
// Storage must be configured and initialized when this is called.
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
.as_ref()
.unwrap()
}
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
@@ -199,7 +212,7 @@ pub async fn wal_backup_launcher_task_main(
ttid = wal_backup_launcher_rx.recv() => {
// channel is never expected to get closed
let ttid = ttid.unwrap();
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
if !conf.is_wal_backup_enabled() {
continue; /* just drain the channel and do nothing */
}
async {
@@ -484,18 +497,12 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
res
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
async fn backup_object(
source_file: &Utf8Path,
target_file: &RemotePath,
size: usize,
) -> Result<()> {
let storage = REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
.as_ref()
.unwrap();
let storage = get_configured_remote_storage();
let file = File::open(&source_file)
.await
@@ -532,6 +539,39 @@ pub async fn read_object(
Ok(Box::pin(reader))
}
/// Delete WAL files for the given timeline. Remote storage must be configured
/// when called.
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
let storage = get_configured_remote_storage();
let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&ttid_path)?;
// A backoff::retry is used here for two reasons:
// - To provide a backoff rather than busy-polling the API on errors
// - To absorb transient 429/503 conditions without hitting our error
// logging path for issues deleting objects.
//
// Note: listing segments might take a long time if there are many of them.
// We don't currently have HTTP request timeout/cancellation, but if/once
// we do, listing should get a streaming interface to make progress.
let token = CancellationToken::new(); // not really used
backoff::retry(
|| async {
let files = storage.list_files(Some(&remote_path)).await?;
storage.delete_objects(&files).await?;
Ok(())
},
|_| false,
3,
10,
"executing WAL segments deletion batch",
backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
)
.await?;
Ok(())
}
/// Copy segments from one timeline to another. Used in copy_timeline.
pub async fn copy_s3_segments(
wal_seg_size: usize,

View File

@@ -12,9 +12,11 @@ from pathlib import Path
# Type-related stuff
from typing import Callable, ClassVar, Dict, Iterator, Optional
import allure
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
from _pytest.terminal import TerminalReporter
from fixtures.log_helper import log
@@ -411,7 +413,10 @@ class NeonBenchmarker:
@pytest.fixture(scope="function")
def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]:
def zenbenchmark(
request: FixtureRequest,
record_property: Callable[[str, object], None],
) -> Iterator[NeonBenchmarker]:
"""
This is a python decorator for benchmark fixtures. It contains functions for
recording measurements, and prints them out at the end.
@@ -419,6 +424,21 @@ def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[Neo
benchmarker = NeonBenchmarker(record_property)
yield benchmarker
results = {}
for _, recorded_property in request.node.user_properties:
name = recorded_property["name"]
value = str(recorded_property["value"])
if (unit := recorded_property["unit"].strip()) != "":
value += f" {unit}"
results[name] = value
content = json.dumps(results, indent=2)
allure.attach(
content,
"benchmarks.json",
allure.attachment_type.JSON,
)
def pytest_addoption(parser: Parser):
parser.addoption(

View File

@@ -16,6 +16,7 @@ class Metrics:
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
filter = filter or {}
res = []
for sample in self.metrics[name]:
try:
if all(sample.labels[k] == v for k, v in filter.items()):

View File

@@ -19,7 +19,7 @@ from functools import cached_property
from itertools import chain, product
from pathlib import Path
from types import TracebackType
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
from urllib.parse import urlparse
import asyncpg
@@ -61,7 +61,7 @@ from fixtures.remote_storage import (
default_remote_storage,
remote_storage_to_toml_inline_table,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
allure_add_grafana_links,
@@ -495,6 +495,8 @@ class NeonEnvBuilder:
self,
initial_tenant_conf: Optional[Dict[str, str]] = None,
default_remote_storage_if_missing: bool = True,
initial_tenant_shard_count: Optional[int] = None,
initial_tenant_shard_stripe_size: Optional[int] = None,
) -> NeonEnv:
"""
Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
@@ -512,7 +514,11 @@ class NeonEnvBuilder:
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
tenant_id=env.initial_tenant,
conf=initial_tenant_conf,
timeline_id=env.initial_timeline,
shard_count=initial_tenant_shard_count,
shard_stripe_size=initial_tenant_shard_stripe_size,
)
assert env.initial_tenant == initial_tenant
assert env.initial_timeline == initial_timeline
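With the two new keyword arguments, a test can boot straight into a sharded initial tenant. A minimal sketch; the test name, shard count, and stripe size are illustrative values only:

from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.types import TenantShardId

def test_sharded_initial_tenant(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start(
        initial_tenant_shard_count=4,
        initial_tenant_shard_stripe_size=2048,
    )
    # The attachment service records where each shard was placed.
    for shard in env.attachment_service.locate(env.initial_tenant):
        shard_id = TenantShardId.parse(shard["shard_id"])
        env.get_pageserver(shard["node_id"]).http_client().tenant_status(shard_id)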
@@ -861,7 +867,9 @@ class NeonEnv:
attachment_service_port = self.port_distributor.get_port()
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service: NeonAttachmentService = NeonAttachmentService(self)
self.attachment_service: NeonAttachmentService = NeonAttachmentService(
self, config.auth_enabled
)
# Create a config file corresponding to the options
cfg: Dict[str, Any] = {
@@ -983,6 +991,16 @@ class NeonEnv:
raise RuntimeError(f"Pageserver with ID {id} not found")
def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]):
"""
Get the NeonPageserver where this tenant shard is currently attached, according
to the attachment service.
"""
meta = self.attachment_service.inspect(tenant_id)
assert meta is not None, f"{tenant_id} attachment location not found"
pageserver_id = meta[1]
return self.get_pageserver(pageserver_id)
def get_safekeeper_connstrs(self) -> str:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
@@ -1226,15 +1244,29 @@ class AbstractNeonCli(abc.ABC):
env_vars[var] = val
# Intercept CalledProcessError and print more info
res = subprocess.run(
args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout,
)
try:
res = subprocess.run(
args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout,
)
except subprocess.TimeoutExpired as e:
if e.stderr:
stderr = e.stderr.decode(errors="replace")
else:
stderr = ""
if e.stdout:
stdout = e.stdout.decode(errors="replace")
else:
stdout = ""
log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
raise
indent = " "
if not res.returncode:
@@ -1285,6 +1317,8 @@ class NeonCli(AbstractNeonCli):
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
conf: Optional[Dict[str, str]] = None,
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
set_default: bool = False,
) -> Tuple[TenantId, TimelineId]:
"""
@@ -1312,6 +1346,12 @@ class NeonCli(AbstractNeonCli):
if set_default:
args.append("--set-default")
if shard_count is not None:
args.extend(["--shard-count", str(shard_count)])
if shard_stripe_size is not None:
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
res = self.raw_cli(args)
res.check_returncode()
return tenant_id, timeline_id
@@ -1636,6 +1676,19 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(args, check_return_code=True)
def tenant_migrate(
self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
):
args = [
"tenant",
"migrate",
"--tenant-id",
str(tenant_shard_id),
"--id",
str(new_pageserver),
]
return self.raw_cli(args, check_return_code=True, timeout=timeout_secs)
def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
return self.raw_cli(["start"], check_return_code=check_return_code)
@@ -1684,9 +1737,10 @@ class Pagectl(AbstractNeonCli):
class NeonAttachmentService:
def __init__(self, env: NeonEnv):
def __init__(self, env: NeonEnv, auth_enabled):
self.env = env
self.running = False
self.auth_enabled = auth_enabled
def start(self):
assert not self.running
@@ -1700,27 +1754,50 @@ class NeonAttachmentService:
self.running = False
return self
def attach_hook_issue(self, tenant_id: TenantId, pageserver_id: int) -> int:
response = requests.post(
def request(self, method, *args, **kwargs) -> requests.Response:
kwargs["headers"] = self.headers()
return requests.request(method, *args, **kwargs)
def headers(self) -> Dict[str, str]:
headers = {}
if self.auth_enabled:
jwt_token = self.env.auth_keys.generate_pageserver_token()
headers["Authorization"] = f"Bearer {jwt_token}"
return headers
def attach_hook_issue(
self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
) -> int:
response = self.request(
"POST",
f"{self.env.control_plane_api}/attach-hook",
json={"tenant_id": str(tenant_id), "node_id": pageserver_id},
json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
headers=self.headers(),
)
response.raise_for_status()
gen = response.json()["gen"]
assert isinstance(gen, int)
return gen
def attach_hook_drop(self, tenant_id: TenantId):
response = requests.post(
def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
response = self.request(
"POST",
f"{self.env.control_plane_api}/attach-hook",
json={"tenant_id": str(tenant_id), "node_id": None},
json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
headers=self.headers(),
)
response.raise_for_status()
def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]:
response = requests.post(
def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]:
"""
:return: 2-tuple of (generation, pageserver id), or None if unknown
"""
response = self.request(
"POST",
f"{self.env.control_plane_api}/inspect",
json={"tenant_id": str(tenant_id)},
json={"tenant_shard_id": str(tenant_shard_id)},
headers=self.headers(),
)
response.raise_for_status()
json = response.json()
@@ -1731,6 +1808,79 @@ class NeonAttachmentService:
else:
return None
def node_register(self, node: NeonPageserver):
body = {
"node_id": int(node.id),
"listen_http_addr": "localhost",
"listen_http_port": node.service_port.http,
}
log.info(f"node_register({body})")
self.request(
"POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers()
).raise_for_status()
def tenant_create(
self,
tenant_id: TenantId,
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
tenant_config: Optional[Dict[Any, Any]] = None,
):
body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
if shard_count is not None:
shard_params = {"count": shard_count}
if shard_stripe_size is not None:
shard_params["stripe_size"] = shard_stripe_size
body["shard_parameters"] = shard_params
if tenant_config is not None:
for k, v in tenant_config.items():
body[k] = v
response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body)
response.raise_for_status()
log.info(f"tenant_create success: {response.json()}")
def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
response = self.request(
"POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
)
response.raise_for_status()
log.info(f"tenant_timeline_create success: {response.json()}")
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
response.raise_for_status()
body = response.json()
shards: list[dict[str, Any]] = body["shards"]
return shards
def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
response = self.request(
"PUT",
f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
json={"new_shard_count": shard_count},
)
response.raise_for_status()
body = response.json()
log.info(f"tenant_shard_split success: {body}")
shards: list[TenantShardId] = body["new_shards"]
return shards
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
response = self.request(
"PUT",
f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate",
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
)
response.raise_for_status()
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
def __enter__(self) -> "NeonAttachmentService":
return self
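Taken together, the new client methods cover the basic shard lifecycle. A hedged usage sketch from inside a test where `env` is a started `NeonEnv`; the function name, shard counts, stripe size, and the `TenantId.generate()` helper are assumptions for illustration:

from fixtures.types import TenantId, TimelineId

def exercise_shard_lifecycle(env):
    svc = env.attachment_service
    tenant_id = TenantId.generate()  # assumed helper, mirroring TimelineId.generate()
    svc.tenant_create(tenant_id, shard_count=2, shard_stripe_size=2048)
    svc.tenant_timeline_create(tenant_id, TimelineId.generate())

    # Split the tenant into more shards, then move one shard to another pageserver.
    new_shards = svc.tenant_shard_split(tenant_id, shard_count=4)
    svc.tenant_shard_migrate(new_shards[0], dest_ps_id=env.pageservers[1].id)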
@@ -2764,6 +2914,7 @@ class Endpoint(PgProtocol):
# Write it back updated
with open(config_path, "w") as file:
log.info(json.dumps(dict(data_dict, **kwargs)))
json.dump(dict(data_dict, **kwargs), file, indent=4)
# Mock the extension part of spec passed from control plane for local testing
@@ -2831,7 +2982,7 @@ class Endpoint(PgProtocol):
hot_standby=hot_standby,
lsn=lsn,
pageserver_id=pageserver_id,
).start(remote_ext_config=remote_ext_config)
).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id)
log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -3202,9 +3353,15 @@ class SafekeeperHttpClient(requests.Session):
)
res.raise_for_status()
def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]:
# With only_local=True, WAL segments in remote storage are left in place.
def timeline_delete(
self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
) -> Dict[Any, Any]:
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
params={
"only_local": str(only_local).lower(),
},
)
res.raise_for_status()
res_json = res.json()
@@ -3344,7 +3501,7 @@ def pytest_addoption(parser: Parser):
SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
)
@@ -3481,9 +3638,7 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(
test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, pageserver_id: Optional[int] = None
):
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
# Get the timeline ID. We need it for the 'basebackup' command
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
@@ -3504,6 +3659,7 @@ def check_restored_datadir_content(
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
cmd = rf"""
{psql_path} \
--no-psqlrc \
@@ -3572,6 +3728,38 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -
time.sleep(0.5)
def tenant_get_shards(
env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int]
) -> list[tuple[TenantShardId, NeonPageserver]]:
"""
Helper for when you want to talk to one or more pageservers, and the
caller _might_ have specified a pageserver, or they might leave it to
us to figure out the shards for a tenant.
If the caller provides `pageserver_id`, it will be used for all shards, even
if the shard is indicated by attachment service to be on some other pageserver.
Caller should iterate over the response to apply their per-pageserver action to
each shard.
"""
if pageserver_id is not None:
override_pageserver = [p for p in env.pageservers if p.id == pageserver_id][0]
else:
override_pageserver = None
if len(env.pageservers) > 1:
return [
(
TenantShardId.parse(s["shard_id"]),
override_pageserver or env.get_pageserver(s["node_id"]),
)
for s in env.attachment_service.locate(tenant_id)
]
else:
# Assume an unsharded tenant
return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)]
def wait_for_last_flush_lsn(
env: NeonEnv,
endpoint: Endpoint,
@@ -3581,10 +3769,24 @@ def wait_for_last_flush_lsn(
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
shards = tenant_get_shards(env, tenant, pageserver_id)
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
return wait_for_last_record_lsn(
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
)
results = []
for tenant_shard_id, pageserver in shards:
log.info(
f"wait_for_last_flush_lsn: waiting for {last_flush_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id}"
)
waited = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
assert waited >= last_flush_lsn
results.append(waited)
# Return the lowest LSN that has been ingested by all shards
return min(results)
def wait_for_wal_insert_lsn(
@@ -3596,9 +3798,16 @@ def wait_for_wal_insert_lsn(
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
return wait_for_last_record_lsn(
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
)
result = None
for tenant_shard_id, pageserver in tenant_get_shards(env, tenant, pageserver_id):
shard_r = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
if result is None:
result = shard_r
assert result is not None
return result
def fork_at_current_lsn(
@@ -3632,11 +3841,13 @@ def last_flush_lsn_upload(
last_flush_lsn = wait_for_last_flush_lsn(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id
)
ps_http = env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
shards = tenant_get_shards(env, tenant_id, pageserver_id)
for tenant_shard_id, pageserver in shards:
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, timeline_id)
wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
return last_flush_lsn
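The helpers above all follow the same calling convention: resolve the shards once with `tenant_get_shards`, then apply a per-pageserver action to each one, as `last_flush_lsn_upload` does. A compact sketch of that pattern; `checkpoint_all_shards` is a hypothetical helper, not part of this diff:

from fixtures.neon_fixtures import tenant_get_shards

def checkpoint_all_shards(env, tenant_id, timeline_id, pageserver_id=None):
    # One checkpoint per shard, on whichever pageserver currently holds it.
    for tenant_shard_id, pageserver in tenant_get_shards(env, tenant_id, pageserver_id):
        pageserver.http_client().timeline_checkpoint(tenant_shard_id, timeline_id)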

View File

@@ -4,7 +4,7 @@ import json
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import requests
from requests.adapters import HTTPAdapter
@@ -13,7 +13,7 @@ from urllib3.util.retry import Retry
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import Fn
@@ -211,7 +211,7 @@ class PageserverHttpClient(requests.Session):
def tenant_create(
self,
new_tenant_id: TenantId,
new_tenant_id: Union[TenantId, TenantShardId],
conf: Optional[Dict[str, Any]] = None,
generation: Optional[int] = None,
) -> TenantId:
@@ -239,7 +239,7 @@ class PageserverHttpClient(requests.Session):
def tenant_attach(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
config: None | Dict[str, Any] = None,
config_null: bool = False,
generation: Optional[int] = None,
@@ -269,7 +269,7 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
self.verbose_error(res)
def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
params = {}
if drop_cache:
params["drop_cache"] = "true"
@@ -278,7 +278,7 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
def tenant_location_conf(
self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None
self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
@@ -294,7 +294,7 @@ class PageserverHttpClient(requests.Session):
)
self.verbose_error(res)
def tenant_delete(self, tenant_id: TenantId):
def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
return res
@@ -310,27 +310,27 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
self.verbose_error(res)
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def tenant_heatmap_upload(self, tenant_id: TenantId):
def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
self.verbose_error(res)
def tenant_secondary_download(self, tenant_id: TenantId):
def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
self.verbose_error(res)
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(
f"http://localhost:{self.port}/v1/tenant/config",
@@ -352,10 +352,12 @@ class PageserverHttpClient(requests.Session):
del current[key]
self.set_tenant_config(tenant_id, current)
def tenant_size(self, tenant_id: TenantId) -> int:
def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int:
return self.tenant_size_and_modelinputs(tenant_id)[0]
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
def tenant_size_and_modelinputs(
self, tenant_id: Union[TenantId, TenantShardId]
) -> Tuple[int, Dict[str, Any]]:
"""
Returns the tenant size, together with the model inputs as the second tuple item.
"""
@@ -370,7 +372,7 @@ class PageserverHttpClient(requests.Session):
assert isinstance(inputs, dict)
return (size, inputs)
def tenant_size_debug(self, tenant_id: TenantId) -> str:
def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str:
"""
Returns the tenant size debug info, as an HTML string
"""
@@ -382,7 +384,7 @@ class PageserverHttpClient(requests.Session):
def timeline_list(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
) -> List[Dict[str, Any]]:
@@ -403,7 +405,7 @@ class PageserverHttpClient(requests.Session):
def timeline_create(
self,
pg_version: PgVersion,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
new_timeline_id: TimelineId,
ancestor_timeline_id: Optional[TimelineId] = None,
ancestor_start_lsn: Optional[Lsn] = None,
@@ -437,7 +439,7 @@ class PageserverHttpClient(requests.Session):
def timeline_detail(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
@@ -462,7 +464,9 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
def timeline_delete(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs
):
"""
Note that deletion is not instant, it is scheduled and performed mostly in the background.
So if you need to wait for it to complete use `timeline_delete_wait_completed`.
@@ -476,7 +480,10 @@ class PageserverHttpClient(requests.Session):
assert res_json is None
def timeline_gc(
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
gc_horizon: Optional[int],
) -> dict[str, Any]:
"""
Unlike most handlers, this will wait for the layers to be actually
@@ -499,7 +506,10 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_compact(
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
force_repartition=False,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -518,7 +528,7 @@ class PageserverHttpClient(requests.Session):
def timeline_get_lsn_by_timestamp(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
timestamp,
version: Optional[int] = None,
@@ -537,7 +547,9 @@ class PageserverHttpClient(requests.Session):
res_json = res.json()
return res_json
def timeline_get_timestamp_of_lsn(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
def timeline_get_timestamp_of_lsn(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn
):
log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}")
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn?lsn={lsn}",
@@ -547,7 +559,10 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_checkpoint(
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
force_repartition=False,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -566,7 +581,7 @@ class PageserverHttpClient(requests.Session):
def timeline_spawn_download_remote_layers(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
max_concurrent_downloads: int,
) -> dict[str, Any]:
@@ -585,7 +600,7 @@ class PageserverHttpClient(requests.Session):
def timeline_poll_download_remote_layers_status(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
spawn_response: dict[str, Any],
poll_state=None,
@@ -607,7 +622,7 @@ class PageserverHttpClient(requests.Session):
def timeline_download_remote_layers(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
max_concurrent_downloads: int,
errors_ok=False,
@@ -689,9 +704,37 @@ class PageserverHttpClient(requests.Session):
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Throws RuntimeError if no metrics matching `names` are found, or if
not all of `names` are found: this method is intended for loading sets
of metrics whose existence is coupled.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
def layer_map_info(
self,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
) -> LayerMapInfo:
res = self.get(
@@ -700,7 +743,9 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return LayerMapInfo.from_json(res.json())
def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
def download_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
):
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
@@ -708,14 +753,18 @@ class PageserverHttpClient(requests.Session):
assert res.status_code == 200
def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
def download_all_layers(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
):
info = self.layer_map_info(tenant_id, timeline_id)
for layer in info.historic_layers:
if not layer.remote:
continue
self.download_layer(tenant_id, timeline_id, layer.layer_file_name)
def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
def evict_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str
):
res = self.delete(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
)
@@ -723,7 +772,7 @@ class PageserverHttpClient(requests.Session):
assert res.status_code in (200, 304)
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
info = self.layer_map_info(tenant_id, timeline_id)
for layer in info.historic_layers:
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
@@ -736,7 +785,7 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return res.json()
def tenant_break(self, tenant_id: TenantId):
def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]):
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
self.verbose_error(res)
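Besides the pervasive widening of tenant parameters to `Union[TenantId, TenantShardId]`, the new `get_metrics_values` helper batches several metric lookups into a single scrape. A hedged usage sketch; the helper function name, the metric names, and the label filter are examples only, not a claim about what the pageserver exports:

def resident_and_remote_sizes(env, tenant_id, timeline_id):
    client = env.pageserver.http_client()
    values = client.get_metrics_values(
        ["pageserver_resident_physical_size", "pageserver_remote_physical_size"],
        filter={"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)},
    )
    # get_metrics_values raises RuntimeError unless every requested name resolves
    # to exactly one sample, so both keys are guaranteed to be present here.
    return values["pageserver_resident_physical_size"], values["pageserver_remote_physical_size"]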

View File

@@ -1,12 +1,12 @@
import time
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import wait_until
@@ -22,7 +22,9 @@ def assert_tenant_state(
def remote_consistent_lsn(
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
pageserver_http: PageserverHttpClient,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
) -> Lsn:
detail = pageserver_http.timeline_detail(tenant, timeline)
@@ -39,7 +41,7 @@ def remote_consistent_lsn(
def wait_for_upload(
pageserver_http: PageserverHttpClient,
tenant: TenantId,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
lsn: Lsn,
):
@@ -92,7 +94,7 @@ def wait_until_tenant_state(
def wait_until_timeline_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
expected_state: str,
iterations: int,
@@ -141,7 +143,9 @@ def wait_until_tenant_active(
def last_record_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
pageserver_http_client: PageserverHttpClient,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
) -> Lsn:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
@@ -152,7 +156,7 @@ def last_record_lsn(
def wait_for_last_record_lsn(
pageserver_http: PageserverHttpClient,
tenant: TenantId,
tenant: Union[TenantId, TenantShardId],
timeline: TimelineId,
lsn: Lsn,
) -> Lsn:
@@ -194,7 +198,7 @@ def wait_for_upload_queue_empty(
def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
iterations: int,
interval: Optional[float] = None,
@@ -219,7 +223,7 @@ def wait_timeline_detail_404(
def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
iterations: int = 20,
interval: Optional[float] = None,
@@ -229,23 +233,18 @@ def timeline_delete_wait_completed(
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
if TYPE_CHECKING:
# TODO avoid by combining remote storage related stuff in single type
# and just passing in this type instead of whole builder
from fixtures.neon_fixtures import NeonEnvBuilder
# remote_storage must not be None; it is typed Optional only so callers can satisfy mypy more easily
def assert_prefix_empty(
neon_env_builder: "NeonEnvBuilder",
remote_storage: Optional[RemoteStorage],
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
):
response = list_prefix(neon_env_builder, prefix)
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
keys = response["KeyCount"]
objects: List[ObjectTypeDef] = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
remote_storage = neon_env_builder.pageserver_remote_storage
is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
if is_mock_s3:
@@ -279,19 +278,20 @@ def assert_prefix_empty(
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
response = list_prefix(neon_env_builder, prefix)
# remote_storage must not be None; it is typed Optional only so callers can satisfy mypy more easily
def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
def list_prefix(
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/"
remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/"
) -> ListObjectsV2OutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
"""
# For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to the S3 API.
remote = neon_env_builder.pageserver_remote_storage
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None

View File

@@ -5,6 +5,7 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
last_flush_lsn_upload,
tenant_get_shards,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
@@ -31,7 +32,7 @@ class Workload:
self._endpoint: Optional[Endpoint] = None
def endpoint(self, pageserver_id: int) -> Endpoint:
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
"main",
@@ -54,7 +55,7 @@ class Workload:
if self._endpoint is not None:
self._endpoint.stop()
def init(self, pageserver_id: int):
def init(self, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
@@ -63,7 +64,7 @@ class Workload:
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def write_rows(self, n, pageserver_id):
def write_rows(self, n, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
@@ -81,7 +82,7 @@ class Workload:
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def churn_rows(self, n, pageserver_id, upload=True):
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
assert self.expect_rows >= n
max_iters = 10
@@ -119,21 +120,24 @@ class Workload:
]
)
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = self.env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
def validate(self, pageserver_id):
def validate(self, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)
result = endpoint.safe_psql_many(
[

View File

@@ -61,7 +61,7 @@ def measure_recovery_time(env: NeonCompare):
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
# we will explicitly create the tenant in the same generation that it was previously
# attached in.
attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
attach_status = env.env.attachment_service.inspect(tenant_shard_id=env.tenant)
assert attach_status is not None
(attach_gen, _) = attach_status

View File

@@ -10,6 +10,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pg_version import PgVersion
from fixtures.types import TenantId, TimelineId
@@ -126,7 +127,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
_ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)
_ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
# Restart the page server
env.pageserver.restart(immediate=True)
@@ -160,7 +161,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
]
)
env.pageserver.tenant_create(env.initial_tenant)
env.neon_cli.create_tenant(env.initial_tenant)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
@@ -216,7 +217,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
# Introduce failpoint when creating a new timeline uninit mark, before any other files were created
pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
_ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)
_ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
# "New" timeline is not present in the list, allowing pageserver to retry the same request

View File

@@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
# We don't have compute_ctl here, so create neon_superuser manually
cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")
with pytest.raises(psycopg2.InternalError):
cur.execute("ALTER ROLE neon_superuser LOGIN")
# Contrary to popular belief, being superman does not make you superuser
cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'")
with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur:
# We allow real SUPERUSERs to ALTER neon_superuser
with pytest.raises(psycopg2.InternalError):
superman_cur.execute("ALTER ROLE neon_superuser LOGIN")
cur.execute("ALTER ROLE neon_superuser LOGIN")
with pytest.raises(psycopg2.InternalError):
cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")

Some files were not shown because too many files have changed in this diff.