diff --git a/.dockerignore b/.dockerignore
index 8b378b5dab..f7a6232ba1 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -22,6 +22,7 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
+!storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 2e56bf909f..1eaf05cd54 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,15 +147,16 @@ jobs:
             "neonvm-captest-new"
           ],
           "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",        "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
         }'
 
         if [ "$(date +%A)" = "Saturday" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
         fi
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -171,7 +172,7 @@ jobs:
 
         if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                   { "platform": "rds-aurora"   }]')
+                                                     { "platform": "rds-aurora"   }]')
         fi
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -190,7 +191,7 @@ jobs:
 
         if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                    { "platform": "rds-aurora",   "scale": "10" }]')
+                                                     { "platform": "rds-aurora",   "scale": "10" }]')
         fi
 
         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -253,6 +254,9 @@ jobs:
           neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
             ;;
+          neonvm-captest-sharding-reuse)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
+            ;;
           neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
             CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
             ;;
@@ -270,11 +274,15 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done
 
     - name: Benchmark init
       uses: ./.github/actions/run-python-test-set
@@ -401,11 +409,15 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done
 
     - name: ClickBench benchmark
       uses: ./.github/actions/run-python-test-set
@@ -507,11 +519,15 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done
 
     - name: Run TPC-H benchmark
       uses: ./.github/actions/run-python-test-set
@@ -597,11 +613,15 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
 
-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done
 
     - name: Run user examples
       uses: ./.github/actions/run-python-test-set
diff --git a/CODEOWNERS b/CODEOWNERS
index 9a23e8c958..af2fa6088e 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,5 +1,5 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/control_plane/attachment_service @neondatabase/storage
+/storage_controller @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
diff --git a/Cargo.lock b/Cargo.lock
index c1c245fa9c..dae406e4ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -270,44 +270,6 @@ dependencies = [
  "critical-section",
 ]
 
-[[package]]
-name = "attachment_service"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "aws-config",
- "bytes",
- "camino",
- "clap",
- "control_plane",
- "diesel",
- "diesel_migrations",
- "fail",
- "futures",
- "git-version",
- "hex",
- "humantime",
- "hyper",
- "lasso",
- "measured",
- "metrics",
- "once_cell",
- "pageserver_api",
- "pageserver_client",
- "postgres_connection",
- "r2d2",
- "reqwest",
- "routerify",
- "serde",
- "serde_json",
- "thiserror",
- "tokio",
- "tokio-util",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -2234,9 +2196,9 @@ dependencies = [
 
 [[package]]
 name = "h2"
-version = "0.3.24"
+version = "0.3.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
+checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
 dependencies = [
  "bytes",
  "fnv",
@@ -3435,9 +3397,9 @@ dependencies = [
 
 [[package]]
 name = "ordered-multimap"
-version = "0.7.1"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
+checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
 dependencies = [
  "dlv-list",
  "hashbrown 0.14.0",
@@ -4199,6 +4161,7 @@ name = "proxy"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-compression",
  "async-trait",
  "aws-config",
  "aws-sdk-iam",
@@ -5621,6 +5584,65 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "storage_controller"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "aws-config",
+ "bytes",
+ "camino",
+ "clap",
+ "control_plane",
+ "diesel",
+ "diesel_migrations",
+ "fail",
+ "futures",
+ "git-version",
+ "hex",
+ "humantime",
+ "hyper",
+ "itertools",
+ "lasso",
+ "measured",
+ "metrics",
+ "once_cell",
+ "pageserver_api",
+ "pageserver_client",
+ "postgres_connection",
+ "r2d2",
+ "reqwest",
+ "routerify",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
+[[package]]
+name = "storcon_cli"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "comfy-table",
+ "hyper",
+ "pageserver_api",
+ "pageserver_client",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "tokio",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -5777,23 +5799,23 @@ dependencies = [
 
 [[package]]
 name = "test-context"
-version = "0.1.4"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
+checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
 dependencies = [
- "async-trait",
  "futures",
  "test-context-macros",
 ]
 
 [[package]]
 name = "test-context-macros"
-version = "0.1.4"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
+checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
 dependencies = [
+ "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.52",
 ]
 
 [[package]]
@@ -5934,9 +5956,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.36.0"
+version = "1.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
+checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
 dependencies = [
  "backtrace",
  "bytes",
diff --git a/Cargo.toml b/Cargo.toml
index 309ebbe119..3c6077648e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = [
     "compute_tools",
     "control_plane",
-    "control_plane/attachment_service",
+    "control_plane/storcon_cli",
     "pageserver",
     "pageserver/compaction",
     "pageserver/ctl",
@@ -12,6 +12,7 @@ members = [
     "proxy",
     "safekeeper",
     "storage_broker",
+    "storage_controller",
     "s3_scrubber",
     "workspace_hack",
     "trace",
@@ -158,7 +159,7 @@ svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.1"
+test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 401feae706..56495dd2da 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
-};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
     ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
@@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
         }
 
-        Some(("set-state", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            let scheduling = subcommand_args.get_one("scheduling");
-            let availability = subcommand_args.get_one("availability");
-
-            let storage_controller = StorageController::from_env(env);
-            storage_controller
-                .node_configure(NodeConfigureRequest {
-                    node_id: pageserver.conf.id,
-                    scheduling: scheduling.cloned(),
-                    availability: availability.cloned(),
-                })
-                .await?;
-        }
-
         Some(("status", subcommand_args)) => {
             match get_pageserver(env, subcommand_args)?.check_status().await {
                 Ok(_) => println!("Page server is up and running"),
@@ -1515,12 +1498,6 @@ fn cli() -> Command {
                     .about("Restart local pageserver")
                     .arg(pageserver_config_args.clone())
                 )
-                .subcommand(Command::new("set-state")
-                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
-                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
-                    .about("Set scheduling or availability state of pageserver node")
-                    .arg(pageserver_config_args.clone())
-                )
         )
         .subcommand(
             Command::new("storage_controller")
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
new file mode 100644
index 0000000000..61eb7fa4e4
--- /dev/null
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "storcon_cli"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+
+[dependencies]
+anyhow.workspace = true
+clap.workspace = true
+comfy-table.workspace = true
+hyper.workspace = true
+pageserver_api.workspace = true
+pageserver_client.workspace = true
+reqwest.workspace = true
+serde.workspace = true
+serde_json = { workspace = true, features = ["raw_value"] }
+thiserror.workspace = true
+tokio.workspace = true
+tracing.workspace = true
+utils.workspace = true
+workspace_hack.workspace = true
+
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
new file mode 100644
index 0000000000..2edd09eac1
--- /dev/null
+++ b/control_plane/storcon_cli/src/main.rs
@@ -0,0 +1,587 @@
+use std::{collections::HashMap, str::FromStr};
+
+use clap::{Parser, Subcommand};
+use hyper::Method;
+use pageserver_api::{
+    controller_api::{
+        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
+        TenantDescribeResponse, TenantPolicyRequest,
+    },
+    models::{
+        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
+    },
+    shard::{ShardStripeSize, TenantShardId},
+};
+use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
+use reqwest::Url;
+use serde::{de::DeserializeOwned, Serialize};
+use utils::id::{NodeId, TenantId};
+
+use pageserver_api::controller_api::{
+    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
+};
+
+#[derive(Subcommand, Debug)]
+enum Command {
+    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
+    /// since pageservers auto-register when they start up
+    NodeRegister {
+        #[arg(long)]
+        node_id: NodeId,
+
+        #[arg(long)]
+        listen_pg_addr: String,
+        #[arg(long)]
+        listen_pg_port: u16,
+
+        #[arg(long)]
+        listen_http_addr: String,
+        #[arg(long)]
+        listen_http_port: u16,
+    },
+
+    /// Modify a node's configuration in the storage controller
+    NodeConfigure {
+        #[arg(long)]
+        node_id: NodeId,
+
+        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
+        /// manually mark a node offline
+        #[arg(long)]
+        availability: Option<NodeAvailabilityArg>,
+        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
+        #[arg(long)]
+        scheduling: Option<NodeSchedulingPolicy>,
+    },
+    /// Modify a tenant's policies in the storage controller
+    TenantPolicy {
+        #[arg(long)]
+        tenant_id: TenantId,
+        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
+        /// or is in the normal attached state with N secondary locations (`attached:N`)
+        #[arg(long)]
+        placement: Option<PlacementPolicyArg>,
+        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
+        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
+        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
+        /// unavailable, and are only for use in emergencies.
+        #[arg(long)]
+        scheduling: Option<ShardSchedulingPolicyArg>,
+    },
+    /// List nodes known to the storage controller
+    Nodes {},
+    /// List tenants known to the storage controller
+    Tenants {},
+    /// Create a new tenant in the storage controller, and by extension on pageservers.
+    TenantCreate {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+    /// Delete a tenant in the storage controller, and by extension on pageservers.
+    TenantDelete {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+    /// Split an existing tenant into a higher number of shards than its current shard count.
+    TenantShardSplit {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        shard_count: u8,
+        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
+        #[arg(long)]
+        stripe_size: Option<u32>,
+    },
+    /// Migrate the attached location for a tenant shard to a specific pageserver.
+    TenantShardMigrate {
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+        #[arg(long)]
+        node: NodeId,
+    },
+    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
+    /// that is passed through to pageservers, and does not affect storage controller behavior.
+    TenantConfig {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        config: String,
+    },
+    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
+    /// alternative to the storage controller's scheduling optimization behavior.
+    TenantScatter {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+    /// Print details about a particular tenant, including all its shards' states.
+    TenantDescribe {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+}
+
+#[derive(Parser)]
+#[command(
+    author,
+    version,
+    about,
+    long_about = "CLI for Storage Controller Support/Debug"
+)]
+#[command(arg_required_else_help(true))]
+struct Cli {
+    #[arg(long)]
+    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
+    api: Url,
+
+    #[arg(long)]
+    /// JWT token for authenticating with storage controller.  Depending on the API used, this
+    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
+    /// a token with both scopes to use with this tool.
+    jwt: Option<String>,
+
+    #[command(subcommand)]
+    command: Command,
+}
+
+#[derive(Debug, Clone)]
+struct PlacementPolicyArg(PlacementPolicy);
+
+impl FromStr for PlacementPolicyArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "detached" => Ok(Self(PlacementPolicy::Detached)),
+            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
+            _ if s.starts_with("attached:") => {
+                let mut splitter = s.split(':');
+                let _prefix = splitter.next().unwrap();
+                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
+                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
+                    None => Err(anyhow::anyhow!(
+                        "Invalid format '{s}', a valid example is 'attached:1'"
+                    )),
+                }
+            }
+            _ => Err(anyhow::anyhow!(
+                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
+            )),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
+
+impl FromStr for ShardSchedulingPolicyArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
+            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
+            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
+            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
+            _ => Err(anyhow::anyhow!(
+                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
+            )),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+struct NodeAvailabilityArg(NodeAvailabilityWrapper);
+
+impl FromStr for NodeAvailabilityArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
+            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        }
+    }
+}
+
+struct Client {
+    base_url: Url,
+    jwt_token: Option<String>,
+    client: reqwest::Client,
+}
+
+impl Client {
+    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
+        Self {
+            base_url,
+            jwt_token,
+            client: reqwest::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
+        }
+    }
+
+    /// Simple HTTP request wrapper for calling into storage controller
+    async fn dispatch<RQ, RS>(
+        &self,
+        method: hyper::Method,
+        path: String,
+        body: Option<RQ>,
+    ) -> mgmt_api::Result<RS>
+    where
+        RQ: Serialize + Sized,
+        RS: DeserializeOwned + Sized,
+    {
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            self.base_url.host_str().unwrap(),
+            self.base_url.port().unwrap()
+        ))
+        .unwrap();
+
+        let mut builder = self.client.request(method, url);
+        if let Some(body) = body {
+            builder = builder.json(&body)
+        }
+        if let Some(jwt_token) = &self.jwt_token {
+            builder = builder.header(
+                reqwest::header::AUTHORIZATION,
+                format!("Bearer {jwt_token}"),
+            );
+        }
+
+        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
+        let response = response.error_from_body().await?;
+
+        response
+            .json()
+            .await
+            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
+    }
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = Cli::parse();
+
+    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
+
+    let mut trimmed = cli.api.to_string();
+    trimmed.pop();
+    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
+
+    match cli.command {
+        Command::NodeRegister {
+            node_id,
+            listen_pg_addr,
+            listen_pg_port,
+            listen_http_addr,
+            listen_http_port,
+        } => {
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::POST,
+                    "control/v1/node".to_string(),
+                    Some(NodeRegisterRequest {
+                        node_id,
+                        listen_pg_addr,
+                        listen_pg_port,
+                        listen_http_addr,
+                        listen_http_port,
+                    }),
+                )
+                .await?;
+        }
+        Command::TenantCreate { tenant_id } => {
+            vps_client
+                .tenant_create(&TenantCreateRequest {
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation: None,
+                    shard_parameters: ShardParameters::default(),
+                    placement_policy: Some(PlacementPolicy::Attached(1)),
+                    config: TenantConfig::default(),
+                })
+                .await?;
+        }
+        Command::TenantDelete { tenant_id } => {
+            let status = vps_client
+                .tenant_delete(TenantShardId::unsharded(tenant_id))
+                .await?;
+            tracing::info!("Delete status: {}", status);
+        }
+        Command::Nodes {} => {
+            let resp = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+            let mut table = comfy_table::Table::new();
+            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
+            for node in resp {
+                table.add_row([
+                    format!("{}", node.id),
+                    node.listen_http_addr,
+                    format!("{:?}", node.scheduling),
+                    format!("{:?}", node.availability),
+                ]);
+            }
+            println!("{table}");
+        }
+        Command::NodeConfigure {
+            node_id,
+            availability,
+            scheduling,
+        } => {
+            let req = NodeConfigureRequest {
+                node_id,
+                availability: availability.map(|a| a.0),
+                scheduling,
+            };
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::PUT,
+                    format!("control/v1/node/{node_id}/config"),
+                    Some(req),
+                )
+                .await?;
+        }
+        Command::Tenants {} => {
+            let resp = storcon_client
+                .dispatch::<(), Vec<TenantDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/tenant".to_string(),
+                    None,
+                )
+                .await?;
+            let mut table = comfy_table::Table::new();
+            table.set_header([
+                "TenantId",
+                "ShardCount",
+                "StripeSize",
+                "Placement",
+                "Scheduling",
+            ]);
+            for tenant in resp {
+                let shard_zero = tenant.shards.into_iter().next().unwrap();
+                table.add_row([
+                    format!("{}", tenant.tenant_id),
+                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
+                    format!("{:?}", tenant.stripe_size),
+                    format!("{:?}", tenant.policy),
+                    format!("{:?}", shard_zero.scheduling_policy),
+                ]);
+            }
+
+            println!("{table}");
+        }
+        Command::TenantPolicy {
+            tenant_id,
+            placement,
+            scheduling,
+        } => {
+            let req = TenantPolicyRequest {
+                scheduling: scheduling.map(|s| s.0),
+                placement: placement.map(|p| p.0),
+            };
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_id}/policy"),
+                    Some(req),
+                )
+                .await?;
+        }
+        Command::TenantShardSplit {
+            tenant_id,
+            shard_count,
+            stripe_size,
+        } => {
+            let req = TenantShardSplitRequest {
+                new_shard_count: shard_count,
+                new_stripe_size: stripe_size.map(ShardStripeSize),
+            };
+
+            let response = storcon_client
+                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_id}/shard_split"),
+                    Some(req),
+                )
+                .await?;
+            println!(
+                "Split tenant {} into {} shards: {}",
+                tenant_id,
+                shard_count,
+                response
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+        }
+        Command::TenantShardMigrate {
+            tenant_shard_id,
+            node,
+        } => {
+            let req = TenantShardMigrateRequest {
+                tenant_shard_id,
+                node_id: node,
+            };
+
+            storcon_client
+                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
+                    Some(req),
+                )
+                .await?;
+        }
+        Command::TenantConfig { tenant_id, config } => {
+            let tenant_conf = serde_json::from_str(&config)?;
+
+            vps_client
+                .tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: tenant_conf,
+                })
+                .await?;
+        }
+        Command::TenantScatter { tenant_id } => {
+            // Find the shards
+            let locate_response = storcon_client
+                .dispatch::<(), TenantLocateResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}/locate"),
+                    None,
+                )
+                .await?;
+            let shards = locate_response.shards;
+
+            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
+            let shard_count = shards.len();
+            for s in shards {
+                let entry = node_to_shards.entry(s.node_id).or_default();
+                entry.push(s.shard_id);
+            }
+
+            // Load list of available nodes
+            let nodes_resp = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            for node in nodes_resp {
+                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
+                    node_to_shards.entry(node.id).or_default();
+                }
+            }
+
+            let max_shard_per_node = shard_count / node_to_shards.len();
+
+            loop {
+                let mut migrate_shard = None;
+                for shards in node_to_shards.values_mut() {
+                    if shards.len() > max_shard_per_node {
+                        // Pick the emptiest
+                        migrate_shard = Some(shards.pop().unwrap());
+                    }
+                }
+                let Some(migrate_shard) = migrate_shard else {
+                    break;
+                };
+
+                // Pick the emptiest node to migrate to
+                let mut destinations = node_to_shards
+                    .iter()
+                    .map(|(k, v)| (k, v.len()))
+                    .collect::<Vec<_>>();
+                destinations.sort_by_key(|i| i.1);
+                let (destination_node, destination_count) = *destinations.first().unwrap();
+                if destination_count + 1 > max_shard_per_node {
+                    // Even the emptiest destination doesn't have space: we're done
+                    break;
+                }
+                let destination_node = *destination_node;
+
+                node_to_shards
+                    .get_mut(&destination_node)
+                    .unwrap()
+                    .push(migrate_shard);
+
+                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
+
+                storcon_client
+                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                        Method::PUT,
+                        format!("control/v1/tenant/{migrate_shard}/migrate"),
+                        Some(TenantShardMigrateRequest {
+                            tenant_shard_id: migrate_shard,
+                            node_id: destination_node,
+                        }),
+                    )
+                    .await?;
+                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
+            }
+
+            // Spread the shards across the nodes
+        }
+        Command::TenantDescribe { tenant_id } => {
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+            let shards = describe_response.shards;
+            let mut table = comfy_table::Table::new();
+            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
+            for shard in shards {
+                let secondary = shard
+                    .node_secondary
+                    .iter()
+                    .map(|n| format!("{}", n))
+                    .collect::<Vec<_>>()
+                    .join(",");
+
+                let mut status_parts = Vec::new();
+                if shard.is_reconciling {
+                    status_parts.push("reconciling");
+                }
+
+                if shard.is_pending_compute_notification {
+                    status_parts.push("pending_compute");
+                }
+
+                if shard.is_splitting {
+                    status_parts.push("splitting");
+                }
+                let status = status_parts.join(",");
+
+                table.add_row([
+                    format!("{}", shard.tenant_shard_id),
+                    shard
+                        .node_attached
+                        .map(|n| format!("{}", n))
+                        .unwrap_or(String::new()),
+                    secondary,
+                    shard.last_error,
+                    status,
+                ]);
+            }
+            println!("{table}");
+        }
+    }
+
+    Ok(())
+}
diff --git a/diesel.toml b/diesel.toml
index 30ed4444d7..558c54a1e1 100644
--- a/diesel.toml
+++ b/diesel.toml
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli
 
 [print_schema]
-file = "control_plane/attachment_service/src/schema.rs"
+file = "storage_controller/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]
 
 [migrations_directory]
-dir = "control_plane/attachment_service/migrations"
+dir = "storage_controller/migrations"
diff --git a/docs/sourcetree.md b/docs/sourcetree.md
index 12fa80349e..3732bfdab2 100644
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)
 
+`storage_controller`:
+
+Neon storage controller, manages a cluster of pageservers and exposes an API that enables
+managing a many-sharded tenant as a single entity.
+
 `/control_plane`:
 
 Local control plane.
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index dcf9e38106..1278f17ad2 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -2,9 +2,9 @@ use std::str::FromStr;
 
 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
-/// in [`attachment_service::http`]
+/// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TenantId};
 
 use crate::{
     models::{ShardParameters, TenantConfig},
@@ -68,12 +68,27 @@ pub struct TenantLocateResponse {
 
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
+    pub tenant_id: TenantId,
     pub shards: Vec<TenantDescribeResponseShard>,
     pub stripe_size: ShardStripeSize,
     pub policy: PlacementPolicy,
     pub config: TenantConfig,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct NodeDescribeResponse {
+    pub id: NodeId,
+
+    pub availability: NodeAvailabilityWrapper,
+    pub scheduling: NodeSchedulingPolicy,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
     pub tenant_shard_id: TenantShardId,
@@ -89,6 +104,8 @@ pub struct TenantDescribeResponseShard {
     pub is_pending_compute_notification: bool,
     /// A shard split is currently underway
     pub is_splitting: bool,
+
+    pub scheduling_policy: ShardSchedulingPolicy,
 }
 
 /// Explicitly migrating a particular shard is a low level operation
@@ -103,7 +120,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
 pub struct UtilizationScore(pub u64);
 
 impl UtilizationScore {
@@ -112,7 +129,7 @@ impl UtilizationScore {
     }
 }
 
-#[derive(Serialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
     // Normal, happy state
@@ -135,7 +152,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
     Active,
     Offline,
@@ -161,21 +178,6 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
     }
 }
 
-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            // This is used when parsing node configuration requests from neon-local.
-            // Assume the worst possible utilisation score
-            // and let it get updated via the heartbeats.
-            "active" => Ok(Self::Active(UtilizationScore::worst())),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
     // Normal mode: the tenant's scheduled locations may be updated at will, including
@@ -202,7 +204,7 @@ impl Default for ShardSchedulingPolicy {
     }
 }
 
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum NodeSchedulingPolicy {
     Active,
     Filling,
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index ab2035f19a..e708854be2 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -565,6 +565,16 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);
 
+impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
+    fn from(arr: [(&str, &str); N]) -> Self {
+        let map: HashMap<String, String> = arr
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self(map)
+    }
+}
+
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index 6adddf52a9..6aa02868e6 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -57,7 +57,6 @@ enum MaybeEnabledStorage {
     Disabled,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -86,7 +85,6 @@ struct AzureWithTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index bc5e40e70f..c5d5216f00 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -219,7 +219,6 @@ enum MaybeEnabledStorage {
     Disabled,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -248,7 +247,6 @@ struct S3WithTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index b7301776eb..0544c5be03 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -182,6 +182,18 @@ where
         }
     }
 
+    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
+    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
+        let internal = self.internal.lock().unwrap();
+        let cnt = internal.current.cnt_value();
+        drop(internal);
+        if cnt >= num {
+            Ok(())
+        } else {
+            Err(cnt)
+        }
+    }
+
     /// Register and return a channel that will be notified when a number arrives,
     /// or None, if it has already arrived.
     fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 3efad546a6..ffe607be4b 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -27,25 +27,25 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-03-20 on i3en.3xlarge
+//! 2024-04-04 on i3en.3xlarge
 //!
 //! ```text
-//! short/1                 time:   [26.483 µs 26.614 µs 26.767 µs]
-//! short/2                 time:   [32.223 µs 32.465 µs 32.767 µs]
-//! short/4                 time:   [47.203 µs 47.583 µs 47.984 µs]
-//! short/8                 time:   [89.135 µs 89.612 µs 90.139 µs]
-//! short/16                time:   [190.12 µs 191.52 µs 192.88 µs]
-//! short/32                time:   [380.96 µs 382.63 µs 384.20 µs]
-//! short/64                time:   [736.86 µs 741.07 µs 745.03 µs]
-//! short/128               time:   [1.4106 ms 1.4206 ms 1.4294 ms]
-//! medium/1                time:   [111.81 µs 112.25 µs 112.79 µs]
-//! medium/2                time:   [158.26 µs 159.13 µs 160.21 µs]
-//! medium/4                time:   [334.65 µs 337.14 µs 340.07 µs]
-//! medium/8                time:   [675.32 µs 679.91 µs 685.25 µs]
-//! medium/16               time:   [1.2929 ms 1.2996 ms 1.3067 ms]
-//! medium/32               time:   [2.4295 ms 2.4461 ms 2.4623 ms]
-//! medium/64               time:   [4.3973 ms 4.4458 ms 4.4875 ms]
-//! medium/128              time:   [7.5955 ms 7.7847 ms 7.9481 ms]
+//! short/1                 time:   [25.925 µs 26.060 µs 26.209 µs]
+//! short/2                 time:   [31.277 µs 31.483 µs 31.722 µs]
+//! short/4                 time:   [45.496 µs 45.831 µs 46.182 µs]
+//! short/8                 time:   [84.298 µs 84.920 µs 85.566 µs]
+//! short/16                time:   [185.04 µs 186.41 µs 187.88 µs]
+//! short/32                time:   [385.01 µs 386.77 µs 388.70 µs]
+//! short/64                time:   [770.24 µs 773.04 µs 776.04 µs]
+//! short/128               time:   [1.5017 ms 1.5064 ms 1.5113 ms]
+//! medium/1                time:   [106.65 µs 107.20 µs 107.85 µs]
+//! medium/2                time:   [153.28 µs 154.24 µs 155.56 µs]
+//! medium/4                time:   [325.67 µs 327.01 µs 328.71 µs]
+//! medium/8                time:   [646.82 µs 650.17 µs 653.91 µs]
+//! medium/16               time:   [1.2645 ms 1.2701 ms 1.2762 ms]
+//! medium/32               time:   [2.4409 ms 2.4550 ms 2.4692 ms]
+//! medium/64               time:   [4.6814 ms 4.7114 ms 4.7408 ms]
+//! medium/128              time:   [8.7790 ms 8.9037 ms 9.0282 ms]
 //! ```
 
 use bytes::{Buf, Bytes};
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index ab55d2b0a3..3c9982ffb8 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -128,12 +128,12 @@ impl Client {
 
     pub async fn timeline_info(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         force_await_logical_size: ForceAwaitLogicalSize,
     ) -> Result<pageserver_api::models::TimelineInfo> {
         let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
             self.mgmt_api_endpoint
         );
 
@@ -151,11 +151,11 @@ impl Client {
 
     pub async fn keyspace(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
     ) -> Result<pageserver_api::models::partitioning::Partitioning> {
         let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
             self.mgmt_api_endpoint
         );
         self.get(&uri)
diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs
index 55844be041..3ae6d99aa7 100644
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,4 +1,5 @@
 use anyhow::Context;
+use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;
 
@@ -95,7 +96,7 @@ async fn main_impl(
             let timeline = *timeline;
             let info = mgmt_api_client
                 .timeline_info(
-                    timeline.tenant_id,
+                    TenantShardId::unsharded(timeline.tenant_id),
                     timeline.timeline_id,
                     ForceAwaitLogicalSize::No,
                 )
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 2838511a77..c3d8e61a2c 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
 
+use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -173,7 +174,10 @@ async fn main_impl(
                 let timeline = *timeline;
                 async move {
                     let partitioning = mgmt_api_client
-                        .keyspace(timeline.tenant_id, timeline.timeline_id)
+                        .keyspace(
+                            TenantShardId::unsharded(timeline.tenant_id),
+                            timeline.timeline_id,
+                        )
                         .await?;
                     let lsn = partitioning.at_lsn;
                     let start = Instant::now();
diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
index 98938d780a..f07beeecfd 100644
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use humantime::Duration;
+use pageserver_api::shard::TenantShardId;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;
 
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
         let mgmt_api_client = Arc::clone(&mgmt_api_client);
         js.spawn(async move {
             let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                .timeline_info(
+                    TenantShardId::unsharded(tl.tenant_id),
+                    tl.timeline_id,
+                    ForceAwaitLogicalSize::Yes,
+                )
                 .await
                 .unwrap();
 
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                 while !info.current_logical_size_is_accurate {
                     ticker.tick().await;
                     info = mgmt_api_client
-                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                        .timeline_info(
+                            TenantShardId::unsharded(tl.tenant_id),
+                            tl.timeline_id,
+                            ForceAwaitLogicalSize::Yes,
+                        )
                         .await
                         .unwrap();
                 }
diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs
index 42c800822b..f0ed46ce23 100644
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -12,7 +12,7 @@ use pageserver_api::{
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, generation::Generation, id::NodeId};
+use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 
 use crate::{
     config::{NodeMetadata, PageServerConf},
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                 .collect(),
         };
 
-        fail::fail_point!("control-plane-client-validate");
+        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
+        if self.cancel.is_cancelled() {
+            return Err(RetryForeverError::ShuttingDown);
+        }
 
         let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
 
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index bb477f89c5..2713309824 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1629,7 +1629,7 @@ components:
           type: integer
           format: int64
           minimum: 0
-          description: The amount of disk space currently utilized by layer files.
+          description: The amount of disk space currently used.
         free_space_bytes:
           type: integer
           format: int64
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 759a1b25ee..47d8ae1148 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -993,11 +993,26 @@ async fn tenant_status(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
+    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
+    let activate = true;
+    #[cfg(feature = "testing")]
+    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
+
     let tenant_info = async {
         let tenant = state
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
 
+        if activate {
+            // This is advisory: we prefer to let the tenant activate on-demand when this function is
+            // called, but it is still valid to return 200 and describe the current state of the tenant
+            // if it doesn't make it into an active state.
+            tenant
+                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+                .await
+                .ok();
+        }
+
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
         for timeline in tenant.list_timelines().iter() {
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 343dec2ca1..ed409d3130 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
+use pageserver_api::key::rel_block_to_key;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
 use tracing::*;
@@ -170,7 +171,10 @@ async fn import_rel(
         let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
-                modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                let key = rel_block_to_key(rel, blknum);
+                if modification.tline.get_shard_identity().is_key_local(&key) {
+                    modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                }
             }
 
             // TODO: UnexpectedEof is expected
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index cc661194e9..ab9a2e8509 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 });
 
 pub(crate) struct WalIngestMetrics {
+    pub(crate) bytes_received: IntCounter,
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
 }
 
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    bytes_received: register_int_counter!(
+        "pageserver_wal_ingest_bytes_received",
+        "Bytes of WAL ingested from safekeepers",
+    )
+    .unwrap(),
     records_received: register_int_counter!(
         "pageserver_wal_ingest_records_received",
         "Number of WAL records received from safekeepers"
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 3d622f1871..3b9a30ba4c 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -876,7 +876,13 @@ impl PageServerHandler {
             if lsn <= last_record_lsn {
                 lsn = last_record_lsn;
             } else {
-                timeline.wait_lsn(lsn, ctx).await?;
+                timeline
+                    .wait_lsn(
+                        lsn,
+                        crate::tenant::timeline::WaitLsnWaiter::PageService,
+                        ctx,
+                    )
+                    .await?;
                 // Since we waited for 'lsn' to arrive, that is now the last
                 // record LSN. (Or close enough for our purposes; the
                 // last-record LSN can advance immediately after we return
@@ -888,7 +894,13 @@ impl PageServerHandler {
                     "invalid LSN(0) in request".into(),
                 ));
             }
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
         }
 
         if lsn < **latest_gc_cutoff_lsn {
@@ -1215,7 +1227,13 @@ impl PageServerHandler {
         if let Some(lsn) = lsn {
             // Backup was requested at a particular LSN. Wait for it to arrive.
             info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
             timeline
                 .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                 .context("invalid basebackup lsn")?;
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 69e163effa..0cc5611a12 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -214,13 +214,12 @@ pub enum TaskKind {
     /// Internally, `Client` hands over requests to the `Connection` object.
     /// The `Connection` object is responsible for speaking the wire protocol.
     ///
-    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
-    /// That abstraction doesn't use `task_mgr`.
+    /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
     /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
     /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
     ///
-    /// Once the connection is established, the `TaskHandle` task creates a
-    /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
+    /// Once the connection is established, the `TaskHandle` task spawns a
+    /// [`WalReceiverConnectionPoller`] task that is responsible for polling
     /// the `Connection` object.
     /// A `CancellationToken` created by the `TaskHandle` task ensures
     /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
@@ -230,7 +229,6 @@ pub enum TaskKind {
     WalReceiverManager,
 
     /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
-    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
     /// See the comment on [`WalReceiverManager`].
     ///
     /// [`WalReceiverManager`]: Self::WalReceiverManager
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0806ef0cf4..17ff033e00 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,6 +12,7 @@
 //!
 
 use anyhow::{bail, Context};
+use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::{Mutex, RwLock};
+use std::sync::Mutex;
 use std::time::{Duration, Instant};
 
 use crate::span;
@@ -260,7 +261,7 @@ pub struct Tenant {
     // We keep TenantConfOpt sturct here to preserve the information
     // about parameters that are not set.
     // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
 
     tenant_shard_id: TenantShardId,
 
@@ -1515,7 +1516,7 @@ impl Tenant {
                     // sizes etc. and that would get confused if the previous page versions
                     // are not in the repository yet.
                     ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
+                        .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                         .await
                         .map_err(|e| match e {
                             e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1606,7 +1607,7 @@ impl Tenant {
         );
 
         {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
 
             if !conf.location.may_delete_layers_hint() {
                 info!("Skipping GC in location state {:?}", conf.location);
@@ -1633,7 +1634,7 @@ impl Tenant {
         }
 
         {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
             if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                 info!("Skipping compaction in location state {:?}", conf.location);
                 return Ok(());
@@ -1782,7 +1783,7 @@ impl Tenant {
     async fn shutdown(
         &self,
         shutdown_progress: completion::Barrier,
-        freeze_and_flush: bool,
+        shutdown_mode: timeline::ShutdownMode,
     ) -> Result<(), completion::Barrier> {
         span::debug_assert_current_span_has_tenant_id();
 
@@ -1829,16 +1830,8 @@ impl Tenant {
             timelines.values().for_each(|timeline| {
                 let timeline = Arc::clone(timeline);
                 let timeline_id = timeline.timeline_id;
-
-                let span =
-                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
-                js.spawn(async move {
-                    if freeze_and_flush {
-                        timeline.flush_and_shutdown().instrument(span).await
-                    } else {
-                        timeline.shutdown().instrument(span).await
-                    }
-                });
+                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
+                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
             })
         };
         // test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2082,14 +2075,14 @@ impl Tenant {
     }
 
     pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf.load().location.attach_mode
     }
 
     /// For API access: generate a LocationConfig equivalent to the one that would be used to
     /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
     /// rare external API calls, like a reconciliation at startup.
     pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.read().unwrap();
+        let conf = self.tenant_conf.load();
 
         let location_config_mode = match conf.location.attach_mode {
             AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2236,7 +2229,7 @@ where
 
 impl Tenant {
     pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
     }
 
     pub fn effective_config(&self) -> TenantConf {
@@ -2245,84 +2238,84 @@ impl Tenant {
     }
 
     pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .checkpoint_distance
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
     }
 
     pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .checkpoint_timeout
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
     }
 
     pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .compaction_target_size
             .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
     }
 
     pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .compaction_period
             .unwrap_or(self.conf.default_tenant_conf.compaction_period)
     }
 
     pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .compaction_threshold
             .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
     }
 
     pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .gc_horizon
             .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
     }
 
     pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .gc_period
             .unwrap_or(self.conf.default_tenant_conf.gc_period)
     }
 
     pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .image_creation_threshold
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
     pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .pitr_interval
             .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
     }
 
     pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .trace_read_requests
             .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
     }
 
     pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
             .min_resident_size_override
             .or(self.conf.default_tenant_conf.min_resident_size_override)
     }
 
     pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         let heatmap_period = tenant_conf
             .heatmap_period
             .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2334,26 +2327,40 @@ impl Tenant {
     }
 
     pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
-        self.tenant_conf_updated();
+        // Use read-copy-update in order to avoid overwriting the location config
+        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // this race is not possible if both request types come from the storage
+        // controller (as they should!) because an exclusive op lock is required
+        // on the storage controller side.
+        self.tenant_conf.rcu(|inner| {
+            Arc::new(AttachedTenantConf {
+                tenant_conf: new_tenant_conf.clone(),
+                location: inner.location,
+            })
+        });
+
+        self.tenant_conf_updated(&new_tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
         let timelines = self.list_timelines();
         for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
         }
     }
 
     pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        *self.tenant_conf.write().unwrap() = new_conf;
-        self.tenant_conf_updated();
+        let new_tenant_conf = new_conf.tenant_conf.clone();
+
+        self.tenant_conf.store(Arc::new(new_conf));
+
+        self.tenant_conf_updated(&new_tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
         let timelines = self.list_timelines();
         for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
         }
     }
 
@@ -2367,11 +2374,8 @@ impl Tenant {
             .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
     }
 
-    pub(crate) fn tenant_conf_updated(&self) {
-        let conf = {
-            let guard = self.tenant_conf.read().unwrap();
-            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
-        };
+    pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
         self.timeline_get_throttle.reconfigure(conf)
     }
 
@@ -2519,7 +2523,7 @@ impl Tenant {
                 Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
                 &crate::metrics::tenant_throttling::TIMELINE_GET,
             )),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
+            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
         }
     }
 
@@ -3505,7 +3509,7 @@ impl Tenant {
     }
 
     pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
     }
 }
 
@@ -3854,6 +3858,7 @@ mod tests {
     use hex_literal::hex;
     use pageserver_api::keyspace::KeySpace;
     use rand::{thread_rng, Rng};
+    use tests::timeline::ShutdownMode;
 
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4299,7 +4304,7 @@ mod tests {
             make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
             // so that all uploads finish & we can call harness.load() below again
             tenant
-                .shutdown(Default::default(), true)
+                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                 .instrument(harness.span())
                 .await
                 .ok()
@@ -4340,7 +4345,7 @@ mod tests {
 
             // so that all uploads finish & we can call harness.load() below again
             tenant
-                .shutdown(Default::default(), true)
+                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                 .instrument(harness.span())
                 .await
                 .ok()
@@ -5121,7 +5126,7 @@ mod tests {
             // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
             let raw_tline = tline.raw_timeline().unwrap();
             raw_tline
-                .shutdown()
+                .shutdown(super::timeline::ShutdownMode::Hard)
                 .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
                 .await;
             std::mem::forget(tline);
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 7d37873a67..d1881f3897 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -14,7 +14,10 @@ use crate::{
     config::PageServerConf,
     context::RequestContext,
     task_mgr::{self, TaskKind},
-    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    tenant::{
+        mgr::{TenantSlot, TenantsMapRemoveResult},
+        timeline::ShutdownMode,
+    },
 };
 
 use super::{
@@ -463,7 +466,7 @@ impl DeleteTenantFlow {
         // tenant.shutdown
         // Its also bad that we're holding tenants.read here.
         // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
             return Err(DeleteTenantError::Other(anyhow::anyhow!(
                 "tenant shutdown is already in progress"
             )));
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index e48b9e83bd..b27230db03 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -72,6 +72,10 @@ impl EphemeralFile {
         self.len
     }
 
+    pub(crate) fn id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
+    }
+
     pub(crate) async fn read_blk(
         &self,
         blknum: u32,
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index b8ed69052f..4c4cd90c99 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -346,35 +346,6 @@ where
     }
 }
 
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum InMemoryLayerHandle {
-    Open {
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-    Frozen {
-        idx: usize,
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-}
-
-impl InMemoryLayerHandle {
-    pub fn get_lsn_floor(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
-            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
-        }
-    }
-
-    pub fn get_end_lsn(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
-            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
-        }
-    }
-}
-
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
         self.historic.iter()
     }
 
-    /// Get a handle for the first in memory layer that matches the provided predicate.
-    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
-    ///
-    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
-    /// the same exclusive region established by holding the layer manager lock.
-    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
+    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
     where
         Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
     {
         if let Some(open) = &self.open_layer {
             if pred(open) {
-                return Some(InMemoryLayerHandle::Open {
-                    lsn_floor: open.get_lsn_range().start,
-                    end_lsn: open.get_lsn_range().end,
-                });
+                return Some(open.clone());
             }
         }
 
-        let pos = self.frozen_layers.iter().rev().position(pred);
-        pos.map(|rev_idx| {
-            let idx = self.frozen_layers.len() - 1 - rev_idx;
-            InMemoryLayerHandle::Frozen {
-                idx,
-                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
-                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
-            }
-        })
-    }
-
-    /// Get the layer pointed to by the provided handle.
-    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
-        match handle {
-            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
-            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
-        }
+        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
     }
 
     ///
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index f01fb9791c..b1b46d487b 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,6 +44,7 @@ use crate::tenant::config::{
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
+use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
 
@@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                             shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                             join_set.spawn(
                                 async move {
-                                    let freeze_and_flush = true;
-
                                     let res = {
                                         let (_guard, shutdown_progress) = completion::channel();
-                                        t.shutdown(shutdown_progress, freeze_and_flush).await
+                                        t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
                                     };
 
                                     if let Err(other_progress) = res {
@@ -1107,7 +1106,7 @@ impl TenantManager {
                 };
 
                 info!("Shutting down attached tenant");
-                match tenant.shutdown(progress, false).await {
+                match tenant.shutdown(progress, ShutdownMode::Hard).await {
                     Ok(()) => {}
                     Err(barrier) => {
                         info!("Shutdown already in progress, waiting for it to complete");
@@ -1223,7 +1222,7 @@ impl TenantManager {
                     TenantSlot::Attached(tenant) => {
                         let (_guard, progress) = utils::completion::channel();
                         info!("Shutting down just-spawned tenant, because tenant manager is shut down");
-                        match tenant.shutdown(progress, false).await {
+                        match tenant.shutdown(progress, ShutdownMode::Hard).await {
                             Ok(()) => {
                                 info!("Finished shutting down just-spawned tenant");
                             }
@@ -1273,7 +1272,7 @@ impl TenantManager {
         };
 
         let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {
                 slot_guard.drop_old_value()?;
             }
@@ -1649,7 +1648,14 @@ impl TenantManager {
                     fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                         "failpoint"
                     )));
-                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+                    if let Err(e) = timeline
+                        .wait_lsn(
+                            *target_lsn,
+                            crate::tenant::timeline::WaitLsnWaiter::Tenant,
+                            ctx,
+                        )
+                        .await
+                    {
                         // Failure here might mean shutdown, in any case this part is an optimization
                         // and we shouldn't hold up the split operation.
                         tracing::warn!(
@@ -1670,7 +1676,7 @@ impl TenantManager {
 
         // Phase 5: Shut down the parent shard, and erase it from disk
         let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
+        match parent.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {}
             Err(other) => {
                 other.wait().await;
@@ -2657,11 +2663,11 @@ where
     let attached_tenant = match slot_guard.get_old_value() {
         Some(TenantSlot::Attached(tenant)) => {
             // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let freeze_and_flush = false;
+            let shutdown_mode = ShutdownMode::Hard;
 
             // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
             // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, shutdown_mode).await {
                 Ok(()) => {}
                 Err(_other) => {
                     // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index cbd942d706..3879135f26 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -200,6 +200,7 @@ use utils::backoff::{
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
+use std::time::Duration;
 
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use std::ops::DerefMut;
@@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
-use crate::deletion_queue::DeletionQueueClient;
+use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
     MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
     RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
+/// Doing non-essential flushes of deletion queue is subject to this timeout, after
+/// which we warn and skip.
+const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
+
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
@@ -588,14 +593,14 @@ impl RemoteTimelineClient {
         upload_queue: &mut UploadQueueInitialized,
         metadata: TimelineMetadata,
     ) {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+
         info!(
-            "scheduling metadata upload with {} files ({} changed)",
+            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
             upload_queue.latest_files.len(),
             upload_queue.latest_files_changes_since_metadata_upload_scheduled,
         );
 
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-
         let index_part = IndexPart::new(
             upload_queue.latest_files.clone(),
             disk_consistent_lsn,
@@ -1050,6 +1055,26 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
+        match tokio::time::timeout(
+            DELETION_QUEUE_FLUSH_TIMEOUT,
+            self.deletion_queue_client.flush_immediate(),
+        )
+        .await
+        {
+            Ok(result) => result,
+            Err(_timeout) => {
+                // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
+                // to ensure that _usually_ objects are really gone after a DELETE is acked.  However, in case of deletion
+                // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
+                tracing::warn!(
+                    "Timed out waiting for deletion queue flush, acking deletion anyway"
+                );
+                Ok(())
+            }
+        }
+    }
+
     /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
     /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
     /// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1099,7 +1124,7 @@ impl RemoteTimelineClient {
 
         // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
         // taking the burden of listing all the layers that we already know we should delete.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
 
         let cancel = shutdown_token();
 
@@ -1173,7 +1198,7 @@ impl RemoteTimelineClient {
 
         // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
         // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
@@ -1569,7 +1594,7 @@ impl RemoteTimelineClient {
     /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
     ///
     /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
     /// to wait for them to complete, after calling this function.
     pub(crate) fn stop(&self) {
         // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8782a9f04e..530e1a3244 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
             // Existing on-disk layers: just update their access time.
             if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                 tracing::debug!("Layer {} is already on disk", layer.name);
+
+                if cfg!(debug_assertions) {
+                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
+                    // are already present on disk are really there.
+                    let local_path = self
+                        .conf
+                        .timeline_path(tenant_shard_id, &timeline.timeline_id)
+                        .join(layer.name.file_name());
+                    match tokio::fs::metadata(&local_path).await {
+                        Ok(meta) => {
+                            tracing::debug!(
+                                "Layer {} present at {}, size {}",
+                                layer.name,
+                                local_path,
+                                meta.len(),
+                            );
+                        }
+                        Err(e) => {
+                            tracing::warn!(
+                                "Layer {} not found at {} ({})",
+                                layer.name,
+                                local_path,
+                                e
+                            );
+                            debug_assert!(false);
+                        }
+                    }
+                }
+
                 if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                     || on_disk.access_time != layer.access_time
                 {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index f44a92a2d7..9a2b086828 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
-use super::layer_map::InMemoryLayerHandle;
-use super::timeline::layer_manager::LayerManager;
+use self::inmemory_layer::InMemoryLayerFileId;
+
 use super::timeline::GetVectoredError;
 use super::PageReconstructError;
 
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
     }
 }
 
-/// Description of layer to be read - the layer map can turn
-/// this description into the actual layer.
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub(crate) enum ReadableLayerDesc {
-    Persistent {
-        desc: PersistentLayerDesc,
-        lsn_range: Range<Lsn>,
-    },
-    InMemory {
-        handle: InMemoryLayerHandle,
-        lsn_ceil: Lsn,
-    },
+/// A key that uniquely identifies a layer in a timeline
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub(crate) enum LayerId {
+    PersitentLayerId(PersistentLayerKey),
+    InMemoryLayerId(InMemoryLayerFileId),
 }
 
-/// Wraper for 'ReadableLayerDesc' sorted by Lsn
+/// Layer wrapper for the read path. Note that it is valid
+/// to use these layers even after external operations have
+/// been performed on them (compaction, freeze, etc.).
 #[derive(Debug)]
-struct ReadableLayerDescOrdered(ReadableLayerDesc);
+pub(crate) enum ReadableLayer {
+    PersistentLayer(Layer),
+    InMemoryLayer(Arc<InMemoryLayer>),
+}
+
+/// A partial description of a read to be done.
+#[derive(Debug, Clone)]
+struct ReadDesc {
+    /// An id used to resolve the readable layer within the fringe
+    layer_id: LayerId,
+    /// Lsn range for the read, used for selecting the next read
+    lsn_range: Range<Lsn>,
+}
 
 /// Data structure which maintains a fringe of layers for the
 /// read path. The fringe is the set of layers which intersects
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
-    layers: HashMap<ReadableLayerDesc, KeySpace>,
+    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
+    layers: HashMap<LayerId, LayerKeyspace>,
+}
+
+#[derive(Debug)]
+struct LayerKeyspace {
+    layer: ReadableLayer,
+    target_keyspace: KeySpace,
 }
 
 impl LayerFringe {
     pub(crate) fn new() -> Self {
         LayerFringe {
-            layers_by_lsn: BinaryHeap::new(),
+            planned_reads_by_lsn: BinaryHeap::new(),
             layers: HashMap::new(),
         }
     }
 
-    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
-        let handle = match self.layers_by_lsn.pop() {
-            Some(h) => h,
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
+        let read_desc = match self.planned_reads_by_lsn.pop() {
+            Some(desc) => desc,
             None => return None,
         };
 
-        let removed = self.layers.remove_entry(&handle.0);
+        let removed = self.layers.remove_entry(&read_desc.layer_id);
         match removed {
-            Some((layer, keyspace)) => Some((layer, keyspace)),
+            Some((
+                _,
+                LayerKeyspace {
+                    layer,
+                    target_keyspace,
+                },
+            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
             None => unreachable!("fringe internals are always consistent"),
         }
     }
 
-    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
-        let entry = self.layers.entry(layer.clone());
+    pub(crate) fn update(
+        &mut self,
+        layer: ReadableLayer,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+    ) {
+        let layer_id = layer.id();
+        let entry = self.layers.entry(layer_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
-                entry.get_mut().merge(&keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
             }
             Entry::Vacant(entry) => {
-                self.layers_by_lsn
-                    .push(ReadableLayerDescOrdered(entry.key().clone()));
-                entry.insert(keyspace);
+                self.planned_reads_by_lsn.push(ReadDesc {
+                    lsn_range,
+                    layer_id: layer_id.clone(),
+                });
+                entry.insert(LayerKeyspace {
+                    layer,
+                    target_keyspace: keyspace,
+                });
             }
         }
     }
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
     }
 }
 
-impl Ord for ReadableLayerDescOrdered {
+impl Ord for ReadDesc {
     fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
+        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
         if ord == std::cmp::Ordering::Equal {
-            self.0
-                .get_lsn_floor()
-                .cmp(&other.0.get_lsn_floor())
-                .reverse()
+            self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
         } else {
             ord
         }
     }
 }
 
-impl PartialOrd for ReadableLayerDescOrdered {
+impl PartialOrd for ReadDesc {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl PartialEq for ReadableLayerDescOrdered {
+impl PartialEq for ReadDesc {
     fn eq(&self, other: &Self) -> bool {
-        self.0.get_lsn_floor() == other.0.get_lsn_floor()
-            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
+        self.lsn_range == other.lsn_range
     }
 }
 
-impl Eq for ReadableLayerDescOrdered {}
+impl Eq for ReadDesc {}
 
-impl ReadableLayerDesc {
-    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+impl ReadableLayer {
+    pub(crate) fn id(&self) -> LayerId {
         match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
-            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
-        }
-    }
-
-    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
-        match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
-            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
+            Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
+            Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
         }
     }
 
     pub(crate) async fn get_values_reconstruct_data(
         &self,
-        layer_manager: &LayerManager,
         keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
         match self {
-            ReadableLayerDesc::Persistent { desc, lsn_range } => {
-                let layer = layer_manager.get_from_desc(desc);
+            ReadableLayer::PersistentLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(
-                        keyspace,
-                        lsn_range.clone(),
-                        reconstruct_state,
-                        ctx,
-                    )
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                     .await
             }
-            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
-                let layer = layer_manager
-                    .layer_map()
-                    .get_in_memory_layer(handle)
-                    .unwrap();
-
+            ReadableLayer::InMemoryLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
                     .await
             }
         }
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 628f12065f..43942ba2db 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::walrecord;
+use crate::{page_cache, walrecord};
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
@@ -36,10 +36,14 @@ use super::{
     ValuesReconstructState,
 };
 
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
+pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
+
 pub struct InMemoryLayer {
     conf: &'static PageServerConf,
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
+    file_id: InMemoryLayerFileId,
 
     /// This layer contains all the changes from 'start_lsn'. The
     /// start is inclusive.
@@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
 };
 
 impl InMemoryLayer {
+    pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
+        self.file_id
+    }
+
     pub(crate) fn get_timeline_id(&self) -> TimelineId {
         self.timeline_id
     }
@@ -443,8 +451,10 @@ impl InMemoryLayer {
         trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
 
         let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
+        let key = InMemoryLayerFileId(file.id());
 
         Ok(InMemoryLayer {
+            file_id: key,
             conf,
             timeline_id,
             tenant_shard_id,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f3565c1fb3..d3c8c5f66c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -9,6 +9,7 @@ pub mod uninit;
 mod walreceiver;
 
 use anyhow::{anyhow, bail, ensure, Context, Result};
+use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
@@ -118,11 +119,11 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
-use super::remote_timeline_client::RemoteTimelineClient;
+use super::config::TenantConf;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
-use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
+use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState {
 
 pub struct Timeline {
     conf: &'static PageServerConf,
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
 
     myself: Weak<Self>,
 
@@ -281,10 +282,12 @@ pub struct Timeline {
     pub(super) flush_loop_state: Mutex<FlushLoopState>,
 
     /// layer_flush_start_tx can be used to wake up the layer-flushing task.
-    /// The value is a counter, incremented every time a new flush cycle is requested.
-    /// The flush cycle counter is sent back on the layer_flush_done channel when
-    /// the flush finishes. You can use that to wait for the flush to finish.
-    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
+    /// - The u64 value is a counter, incremented every time a new flush cycle is requested.
+    ///   The flush cycle counter is sent back on the layer_flush_done channel when
+    ///   the flush finishes. You can use that to wait for the flush to finish.
+    /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
+    ///   read by whoever sends an update
+    layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
     /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
     layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
 
@@ -612,6 +615,25 @@ pub enum GetVectoredImpl {
     Vectored,
 }
 
+pub(crate) enum WaitLsnWaiter<'a> {
+    Timeline(&'a Timeline),
+    Tenant,
+    PageService,
+}
+
+/// Argument to [`Timeline::shutdown`].
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum ShutdownMode {
+    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
+    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
+    ///
+    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
+    /// the call to [`Timeline::shutdown`].
+    FreezeAndFlush,
+    /// Shut down immediately, without waiting for any open layers to flush.
+    Hard,
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -1060,7 +1082,8 @@ impl Timeline {
     pub(crate) async fn wait_lsn(
         &self,
         lsn: Lsn,
-        _ctx: &RequestContext, /* Prepare for use by cancellation */
+        who_is_waiting: WaitLsnWaiter<'_>,
+        ctx: &RequestContext, /* Prepare for use by cancellation */
     ) -> Result<(), WaitLsnError> {
         if self.cancel.is_cancelled() {
             return Err(WaitLsnError::Shutdown);
@@ -1068,20 +1091,28 @@ impl Timeline {
             return Err(WaitLsnError::BadState);
         }
 
-        // This should never be called from the WAL receiver, because that could lead
-        // to a deadlock.
-        debug_assert!(
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
-            "wait_lsn cannot be called in WAL receiver"
-        );
-        debug_assert!(
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
-            "wait_lsn cannot be called in WAL receiver"
-        );
-        debug_assert!(
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
-            "wait_lsn cannot be called in WAL receiver"
-        );
+        if cfg!(debug_assertions) {
+            match ctx.task_kind() {
+                TaskKind::WalReceiverManager
+                | TaskKind::WalReceiverConnectionHandler
+                | TaskKind::WalReceiverConnectionPoller => {
+                    let is_myself = match who_is_waiting {
+                        WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
+                        WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
+                    };
+                    if is_myself {
+                        if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
+                            // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
+                            panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
+                        }
+                    } else {
+                        // if another  timeline's  is waiting for us, there's no deadlock risk because
+                        // our walreceiver task can make progress independent of theirs
+                    }
+                }
+                _ => {}
+            }
+        }
 
         let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
 
@@ -1140,8 +1171,8 @@ impl Timeline {
     /// Flush to disk all data that was written with the put_* functions
     #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
     pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
-        self.freeze_inmem_layer(false).await;
-        self.flush_frozen_layers_and_wait().await
+        let to_lsn = self.freeze_inmem_layer(false).await;
+        self.flush_frozen_layers_and_wait(to_lsn).await
     }
 
     /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
@@ -1161,7 +1192,39 @@ impl Timeline {
         };
 
         let Some(open_layer) = &layers_guard.layer_map().open_layer else {
-            // No open layer, no work to do.
+            // If there is no open layer, we have no layer freezing to do.  However, we might need to generate
+            // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
+            // that didn't result in writes to this shard.
+
+            // Must not hold the layers lock while waiting for a flush.
+            drop(layers_guard);
+
+            let last_record_lsn = self.get_last_record_lsn();
+            let disk_consistent_lsn = self.get_disk_consistent_lsn();
+            if last_record_lsn > disk_consistent_lsn {
+                // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
+                // we are a sharded tenant and have skipped some WAL
+                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
+                if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
+                    // This should be somewhat rare, so we log it at INFO level.
+                    //
+                    // We checked for checkpoint timeout so that a shard without any
+                    // data ingested (yet) doesn't write a remote index as soon as it
+                    // sees its LSN advance: we only do this if we've been layer-less
+                    // for some time.
+                    tracing::info!(
+                        "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
+                        disk_consistent_lsn,
+                        last_record_lsn
+                    );
+
+                    // The flush loop will update remote consistent LSN as well as disk consistent LSN.
+                    self.flush_frozen_layers_and_wait(last_record_lsn)
+                        .await
+                        .ok();
+                }
+            }
+
             return;
         };
 
@@ -1290,83 +1353,119 @@ impl Timeline {
         self.launch_eviction_task(parent, background_jobs_can_start);
     }
 
-    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
-    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
+    /// After this function returns, there are no timeline-scoped tasks are left running.
     ///
-    /// While we are flushing, we continue to accept read I/O.
-    pub(crate) async fn flush_and_shutdown(&self) {
+    /// The preferred pattern for is:
+    /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
+    /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
+    ///   go the extra mile and keep track of JoinHandles
+    /// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
+    ///   instead of spawning directly on a runtime. It is a more composable / testable pattern.
+    ///
+    /// For legacy reasons, we still have multiple tasks spawned using
+    /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
+    /// We refer to these as "timeline-scoped task_mgr tasks".
+    /// Some of these tasks are already sensitive to Timeline::cancel while others are
+    /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
+    /// or [`task_mgr::shutdown_watcher`].
+    /// We want to gradually convert the code base away from these.
+    ///
+    /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
+    /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
+    /// ones that aren't mentioned here):
+    /// - [`TaskKind::TimelineDeletionWorker`]
+    ///    - NB: also used for tenant deletion
+    /// - [`TaskKind::RemoteUploadTask`]`
+    /// - [`TaskKind::InitialLogicalSizeCalculation`]
+    /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
+    // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
+    /// - [`TaskKind::Eviction`]
+    /// - [`TaskKind::LayerFlushTask`]
+    /// - [`TaskKind::OndemandLogicalSizeCalculation`]
+    /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
+    pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
-        // trying to flush
-        tracing::debug!("Waiting for WalReceiverManager...");
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
-        )
-        .await;
+        let try_freeze_and_flush = match mode {
+            ShutdownMode::FreezeAndFlush => true,
+            ShutdownMode::Hard => false,
+        };
 
-        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
+        // Regardless of whether we're going to try_freeze_and_flush
+        // or not, stop ingesting any more data. Walreceiver only provides
+        // cancellation but no "wait until gone", because it uses the Timeline::gate.
+        // So, only after the self.gate.close() below will we know for sure that
+        // no walreceiver tasks are left.
+        // For `try_freeze_and_flush=true`, this means that we might still be ingesting
+        // data during the call to `self.freeze_and_flush()` below.
+        // That's not ideal, but, we don't have the concept of a ChildGuard,
+        // which is what we'd need to properly model early shutdown of the walreceiver
+        // task sub-tree before the other Timeline task sub-trees.
+        let walreceiver = self.walreceiver.lock().unwrap().take();
+        tracing::debug!(
+            is_some = walreceiver.is_some(),
+            "Waiting for WalReceiverManager..."
+        );
+        if let Some(walreceiver) = walreceiver {
+            walreceiver.cancel();
+        }
+        // ... and inform any waiters for newer LSNs that there won't be any.
         self.last_record_lsn.shutdown();
 
-        // now all writers to InMemory layer are gone, do the final flush if requested
-        match self.freeze_and_flush().await {
-            Ok(_) => {
-                // drain the upload queue
-                if let Some(client) = self.remote_client.as_ref() {
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    client.shutdown().await;
+        if try_freeze_and_flush {
+            // we shut down walreceiver above, so, we won't add anything more
+            // to the InMemoryLayer; freeze it and wait for all frozen layers
+            // to reach the disk & upload queue, then shut the upload queue and
+            // wait for it to drain.
+            match self.freeze_and_flush().await {
+                Ok(_) => {
+                    // drain the upload queue
+                    if let Some(client) = self.remote_client.as_ref() {
+                        // if we did not wait for completion here, it might be our shutdown process
+                        // didn't wait for remote uploads to complete at all, as new tasks can forever
+                        // be spawned.
+                        //
+                        // what is problematic is the shutting down of RemoteTimelineClient, because
+                        // obviously it does not make sense to stop while we wait for it, but what
+                        // about corner cases like s3 suddenly hanging up?
+                        client.shutdown().await;
+                    }
+                }
+                Err(e) => {
+                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                    // we have some extra WAL replay to do next time the timeline starts.
+                    warn!("failed to freeze and flush: {e:#}");
                 }
             }
-            Err(e) => {
-                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                // we have some extra WAL replay to do next time the timeline starts.
-                warn!("failed to freeze and flush: {e:#}");
-            }
         }
 
-        self.shutdown().await;
-    }
-
-    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
-    /// the graceful [`Timeline::flush_and_shutdown`] function.
-    pub(crate) async fn shutdown(&self) {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
         // Signal any subscribers to our cancellation token to drop out
         tracing::debug!("Cancelling CancellationToken");
         self.cancel.cancel();
 
-        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
-        // while doing so.
-        self.last_record_lsn.shutdown();
-
-        // Shut down the layer flush task before the remote client, as one depends on the other
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::LayerFlushTask),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
-        )
-        .await;
-
-        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
-        // case our caller wants to use that for a deletion
+        // Transition the remote_client into a state where it's only useful for timeline deletion.
+        // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
         if let Some(remote_client) = self.remote_client.as_ref() {
             remote_client.stop();
+            // As documented in remote_client.stop()'s doc comment, it's our responsibility
+            // to shut down the upload queue tasks.
+            // TODO: fix that, task management should be encapsulated inside remote_client.
+            task_mgr::shutdown_tasks(
+                Some(TaskKind::RemoteUploadTask),
+                Some(self.tenant_shard_id),
+                Some(self.timeline_id),
+            )
+            .await;
         }
 
+        // TODO: work toward making this a no-op. See this funciton's doc comment for more context.
         tracing::debug!("Waiting for tasks...");
-
         task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
 
-        // Finally wait until any gate-holders are complete
+        // Finally wait until any gate-holders are complete.
+        //
+        // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
+        // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
         self.gate.close().await;
 
         self.metrics.shutdown();
@@ -1570,57 +1669,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
     pub(crate) fn get_lazy_slru_download(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .lazy_slru_download
             .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
     }
 
     fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .checkpoint_distance
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
     }
 
     fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .checkpoint_timeout
             .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
     }
 
     fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .compaction_target_size
             .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
     }
 
     fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .compaction_threshold
             .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
     }
 
     fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .image_creation_threshold
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
     fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = &self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .compaction_algorithm
             .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
     }
 
     fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
         tenant_conf
+            .tenant_conf
             .eviction_policy
             .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
     }
@@ -1635,22 +1742,25 @@ impl Timeline {
     }
 
     fn get_image_layer_creation_check_threshold(&self) -> u8 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
-        tenant_conf.image_layer_creation_check_threshold.unwrap_or(
-            self.conf
-                .default_tenant_conf
-                .image_layer_creation_check_threshold,
-        )
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .image_layer_creation_check_threshold
+            .unwrap_or(
+                self.conf
+                    .default_tenant_conf
+                    .image_layer_creation_check_threshold,
+            )
     }
 
-    pub(super) fn tenant_conf_updated(&self) {
+    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
         // NB: Most tenant conf options are read by background loops, so,
         // changes will automatically be picked up.
 
         // The threshold is embedded in the metric. So, we need to update it.
         {
             let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                &self.tenant_conf.read().unwrap().tenant_conf,
+                new_conf,
                 &self.conf.default_tenant_conf,
             );
 
@@ -1677,7 +1787,7 @@ impl Timeline {
     #[allow(clippy::too_many_arguments)]
     pub(super) fn new(
         conf: &'static PageServerConf,
-        tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+        tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
         metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         timeline_id: TimelineId,
@@ -1693,17 +1803,16 @@ impl Timeline {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
         let (state, _) = watch::channel(state);
 
-        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
+        let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
         let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
 
-        let tenant_conf_guard = tenant_conf.read().unwrap();
-
-        let evictions_low_residence_duration_metric_threshold =
+        let evictions_low_residence_duration_metric_threshold = {
+            let loaded_tenant_conf = tenant_conf.load();
             Self::get_evictions_low_residence_duration_metric_threshold(
-                &tenant_conf_guard.tenant_conf,
+                &loaded_tenant_conf.tenant_conf,
                 &conf.default_tenant_conf,
-            );
-        drop(tenant_conf_guard);
+            )
+        };
 
         Arc::new_cyclic(|myself| {
             let mut result = Timeline {
@@ -1886,20 +1995,19 @@ impl Timeline {
             self.timeline_id, self.tenant_shard_id
         );
 
-        let tenant_conf_guard = self.tenant_conf.read().unwrap();
-        let wal_connect_timeout = tenant_conf_guard
+        let tenant_conf = self.tenant_conf.load();
+        let wal_connect_timeout = tenant_conf
             .tenant_conf
             .walreceiver_connect_timeout
             .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
-        let lagging_wal_timeout = tenant_conf_guard
+        let lagging_wal_timeout = tenant_conf
             .tenant_conf
             .lagging_wal_timeout
             .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
-        let max_lsn_wal_lag = tenant_conf_guard
+        let max_lsn_wal_lag = tenant_conf
             .tenant_conf
             .max_lsn_wal_lag
             .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
-        drop(tenant_conf_guard);
 
         let mut guard = self.walreceiver.lock().unwrap();
         assert!(
@@ -2447,10 +2555,6 @@ impl Timeline {
                 debug!("cancelling logical size calculation for timeline shutdown");
                 calculation.await
             }
-            _ = task_mgr::shutdown_watcher() => {
-                debug!("cancelling logical size calculation for task shutdown");
-                calculation.await
-            }
         }
     }
 
@@ -2905,16 +3009,6 @@ impl Timeline {
 
         let mut completed_keyspace = KeySpace::default();
 
-        // Hold the layer map whilst visiting the timeline to prevent
-        // compaction, eviction and flushes from rendering the layers unreadable.
-        //
-        // TODO: Do we actually need to do this? In theory holding on
-        // to [`tenant::storage_layer::Layer`] should be enough. However,
-        // [`Timeline::get`] also holds the lock during IO, so more investigation
-        // is needed.
-        let guard = timeline.layers.read().await;
-        let layers = guard.layer_map();
-
         loop {
             if cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
@@ -2924,6 +3018,9 @@ impl Timeline {
             unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
             completed_keyspace.merge(&keys_done_last_step);
 
+            let guard = timeline.layers.read().await;
+            let layers = guard.layer_map();
+
             let in_memory_layer = layers.find_in_memory_layer(|l| {
                 let start_lsn = l.get_lsn_range().start;
                 cont_lsn > start_lsn
@@ -2931,12 +3028,11 @@ impl Timeline {
 
             match in_memory_layer {
                 Some(l) => {
+                    let lsn_range = l.get_lsn_range().start..cont_lsn;
                     fringe.update(
-                        ReadableLayerDesc::InMemory {
-                            handle: l,
-                            lsn_ceil: cont_lsn,
-                        },
+                        ReadableLayer::InMemoryLayer(l),
                         unmapped_keyspace.clone(),
+                        lsn_range,
                     );
                 }
                 None => {
@@ -2948,30 +3044,43 @@ impl Timeline {
                             .into_iter()
                             .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
                                 (
-                                    ReadableLayerDesc::Persistent {
-                                        desc: (*layer).clone(),
-                                        lsn_range: lsn_floor..cont_lsn,
-                                    },
+                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
                                     keyspace_accum.to_keyspace(),
+                                    lsn_floor..cont_lsn,
                                 )
                             })
-                            .for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
+                            .for_each(|(layer, keyspace, lsn_range)| {
+                                fringe.update(layer, keyspace, lsn_range)
+                            });
                     }
                 }
             }
 
-            if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
+            // It's safe to drop the layer map lock after planning the next round of reads.
+            // The fringe keeps readable handles for the layers which are safe to read even
+            // if layers were compacted or flushed.
+            //
+            // The more interesting consideration is: "Why is the read algorithm still correct
+            // if the layer map changes while it is operating?". Doing a vectored read on a
+            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
+            // covered by the read. The layer map tells us how to move the lsn downwards for a
+            // range at *a particular point in time*. It is fine for the answer to be different
+            // at two different time points.
+            drop(guard);
+
+            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
+                let next_cont_lsn = lsn_range.start;
                 layer_to_read
                     .get_values_reconstruct_data(
-                        &guard,
                         keyspace_to_read.clone(),
+                        lsn_range,
                         reconstruct_state,
                         ctx,
                     )
                     .await?;
 
                 unmapped_keyspace = keyspace_to_read;
-                cont_lsn = layer_to_read.get_lsn_floor();
+                cont_lsn = next_cont_lsn;
             } else {
                 break;
             }
@@ -3049,7 +3158,7 @@ impl Timeline {
             }
         }
         ancestor
-            .wait_lsn(self.ancestor_lsn, ctx)
+            .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
             .await
             .map_err(|e| match e {
                 e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -3099,7 +3208,9 @@ impl Timeline {
         self.last_record_lsn.advance(new_lsn);
     }
 
-    async fn freeze_inmem_layer(&self, write_lock_held: bool) {
+    /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
+    /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
+    async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
         // Freeze the current open in-memory layer. It will be written to disk on next
         // iteration.
 
@@ -3109,7 +3220,9 @@ impl Timeline {
             Some(self.write_lock.lock().await)
         };
 
-        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
+        let to_lsn = self.get_last_record_lsn();
+        self.freeze_inmem_layer_at(to_lsn).await;
+        to_lsn
     }
 
     async fn freeze_inmem_layer_at(&self, at: Lsn) {
@@ -3122,25 +3235,24 @@ impl Timeline {
     /// Layer flusher task's main loop.
     async fn flush_loop(
         self: &Arc<Self>,
-        mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
+        mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
         ctx: &RequestContext,
     ) {
         info!("started flush loop");
         loop {
             tokio::select! {
                 _ = self.cancel.cancelled() => {
-                    info!("shutting down layer flush task");
-                    break;
-                },
-                _ = task_mgr::shutdown_watcher() => {
-                    info!("shutting down layer flush task");
+                    info!("shutting down layer flush task due to Timeline::cancel");
                     break;
                 },
                 _ = layer_flush_start_rx.changed() => {}
             }
-
             trace!("waking up");
-            let flush_counter = *layer_flush_start_rx.borrow();
+            let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
+
+            // The highest LSN to which we flushed in the loop over frozen layers
+            let mut flushed_to_lsn = Lsn(0);
+
             let result = loop {
                 if self.cancel.is_cancelled() {
                     info!("dropping out of flush loop for timeline shutdown");
@@ -3161,7 +3273,9 @@ impl Timeline {
                     break Ok(());
                 };
                 match self.flush_frozen_layer(layer_to_flush, ctx).await {
-                    Ok(()) => {}
+                    Ok(this_layer_to_lsn) => {
+                        flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
+                    }
                     Err(FlushLayerError::Cancelled) => {
                         info!("dropping out of flush loop for timeline shutdown");
                         return;
@@ -3170,11 +3284,36 @@ impl Timeline {
                         FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                     ) => {
                         error!("could not flush frozen layer: {err:?}");
-                        break err;
+                        break err.map(|_| ());
                     }
                 }
                 timer.stop_and_record();
             };
+
+            // Unsharded tenants should never advance their LSN beyond the end of the
+            // highest layer they write: such gaps between layer data and the frozen LSN
+            // are only legal on sharded tenants.
+            debug_assert!(
+                self.shard_identity.count.count() > 1
+                    || flushed_to_lsn >= frozen_to_lsn
+                    || !flushed_to_lsn.is_valid()
+            );
+
+            if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
+                // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
+                // to us via layer_flush_start_rx, then advance it here.
+                //
+                // This path is only taken for tenants with multiple shards: single sharded tenants should
+                // never encounter a gap in the wal.
+                let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
+                tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
+                if self.set_disk_consistent_lsn(frozen_to_lsn) {
+                    if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
+                        tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
+                    }
+                }
+            }
+
             // Notify any listeners that we're done
             let _ = self
                 .layer_flush_done_tx
@@ -3182,7 +3321,13 @@ impl Timeline {
         }
     }
 
-    async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
+    /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
+    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
+    ///
+    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
+    /// it means no data will be written between the top of the highest frozen layer and to_lsn,
+    /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
+    async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
         let mut rx = self.layer_flush_done_tx.subscribe();
 
         // Increment the flush cycle counter and wake up the flush task.
@@ -3196,9 +3341,10 @@ impl Timeline {
             anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
         }
 
-        self.layer_flush_start_tx.send_modify(|counter| {
+        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
             my_flush_request = *counter + 1;
             *counter = my_flush_request;
+            *lsn = std::cmp::max(last_record_lsn, *lsn);
         });
 
         loop {
@@ -3235,16 +3381,22 @@ impl Timeline {
     }
 
     fn flush_frozen_layers(&self) {
-        self.layer_flush_start_tx.send_modify(|val| *val += 1);
+        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
+            *counter += 1;
+
+            *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
+        });
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
+    ///
+    /// Return value is the last lsn (inclusive) of the layer that was frozen.
     #[instrument(skip_all, fields(layer=%frozen_layer))]
     async fn flush_frozen_layer(
         self: &Arc<Self>,
         frozen_layer: Arc<InMemoryLayer>,
         ctx: &RequestContext,
-    ) -> Result<(), FlushLayerError> {
+    ) -> Result<Lsn, FlushLayerError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         // As a special case, when we have just imported an image into the repository,
@@ -3319,7 +3471,6 @@ impl Timeline {
         }
 
         let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
-        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
 
         // The new on-disk layers are now in the layer map. We can remove the
         // in-memory layer from the map now. The flushed layer is stored in
@@ -3333,10 +3484,7 @@ impl Timeline {
 
             guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
 
-            if disk_consistent_lsn != old_disk_consistent_lsn {
-                assert!(disk_consistent_lsn > old_disk_consistent_lsn);
-                self.disk_consistent_lsn.store(disk_consistent_lsn);
-
+            if self.set_disk_consistent_lsn(disk_consistent_lsn) {
                 // Schedule remote uploads that will reflect our new disk_consistent_lsn
                 self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
             }
@@ -3353,7 +3501,22 @@ impl Timeline {
         // This failpoint is used by another test case `test_pageserver_recovery`.
         fail_point!("flush-frozen-exit");
 
-        Ok(())
+        Ok(Lsn(lsn_range.end.0 - 1))
+    }
+
+    /// Return true if the value changed
+    ///
+    /// This function must only be used from the layer flush task, and may not be called concurrently.
+    fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
+        // We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
+        let old_value = self.disk_consistent_lsn.load();
+        if new_value != old_value {
+            assert!(new_value >= old_value);
+            self.disk_consistent_lsn.store(new_value);
+            true
+        } else {
+            false
+        }
     }
 
     /// Update metadata file
@@ -3873,6 +4036,24 @@ impl Timeline {
         Ok(())
     }
 
+    /// Schedules the uploads of the given image layers
+    fn upload_new_image_layers(
+        self: &Arc<Self>,
+        new_images: impl IntoIterator<Item = ResidentLayer>,
+    ) -> anyhow::Result<()> {
+        let Some(remote_client) = &self.remote_client else {
+            return Ok(());
+        };
+        for layer in new_images {
+            remote_client.schedule_layer_file_upload(layer)?;
+        }
+        // should any new image layer been created, not uploading index_part will
+        // result in a mismatch between remote_physical_size and layermap calculated
+        // size, which will fail some tests, but should not be an issue otherwise.
+        remote_client.schedule_index_upload_for_file_changes()?;
+        Ok(())
+    }
+
     /// Update information about which layer files need to be retained on
     /// garbage collection. This is separate from actually performing the GC,
     /// and is updated more frequently, so that compaction can remove obsolete
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 74b75dabf0..ab001bf10d 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -125,18 +125,8 @@ impl Timeline {
                     )
                     .await
                     .map_err(anyhow::Error::from)?;
-                if let Some(remote_client) = &self.remote_client {
-                    for layer in layers {
-                        remote_client.schedule_layer_file_upload(layer)?;
-                    }
-                }
 
-                if let Some(remote_client) = &self.remote_client {
-                    // should any new image layer been created, not uploading index_part will
-                    // result in a mismatch between remote_physical_size and layermap calculated
-                    // size, which will fail some tests, but should not be an issue otherwise.
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
+                self.upload_new_image_layers(layers)?;
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -818,7 +808,10 @@ impl TimelineAdaptor {
         self.timeline
             .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
             .await?;
-        self.new_images.clear();
+
+        self.timeline
+            .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
+
         self.new_deltas.clear();
         self.layers_to_delete.clear();
         Ok(())
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index ab0a88c764..af10c1c84b 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, Instrument};
+use tracing::{error, info, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};
 
 use crate::{
@@ -14,7 +14,6 @@ use crate::{
     deletion_queue::DeletionQueueClient,
     task_mgr::{self, TaskKind},
     tenant::{
-        debug_assert_current_span_has_tenant_and_timeline_id,
         metadata::TimelineMetadata,
         remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
         CreateTimelineCause, DeleteTimelineError, Tenant,
@@ -23,58 +22,6 @@ use crate::{
 
 use super::{Timeline, TimelineResources};
 
-/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-    // Notify any timeline work to drop out of loops/requests
-    tracing::debug!("Cancelling CancellationToken");
-    timeline.cancel.cancel();
-
-    // Stop the walreceiver first.
-    debug!("waiting for wal receiver to shutdown");
-    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
-    if let Some(walreceiver) = maybe_started_walreceiver {
-        walreceiver.stop().await;
-    }
-    debug!("wal receiver shutdown confirmed");
-
-    // Shut down the layer flush task before the remote client, as one depends on the other
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_shard_id),
-        Some(timeline.timeline_id),
-    )
-    .await;
-
-    // Prevent new uploads from starting.
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        remote_client.stop();
-    }
-
-    // Stop & wait for the remaining timeline tasks, including upload tasks.
-    // NB: This and other delete_timeline calls do not run as a task_mgr task,
-    //     so, they are not affected by this shutdown_tasks() call.
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(
-        None,
-        Some(timeline.tenant_shard_id),
-        Some(timeline.timeline_id),
-    )
-    .await;
-
-    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-index-deleted-at"
-        ))?
-    });
-
-    tracing::debug!("Waiting for gate...");
-    timeline.gate.close().await;
-    tracing::debug!("Shutdown complete");
-
-    Ok(())
-}
-
 /// Mark timeline as deleted in S3 so we won't pick it up next time
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
@@ -268,7 +215,14 @@ impl DeleteTimelineFlow {
 
         guard.mark_in_progress()?;
 
-        stop_tasks(&timeline).await?;
+        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+        timeline.shutdown(super::ShutdownMode::Hard).await;
+
+        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: timeline-delete-before-index-deleted-at"
+            ))?
+        });
 
         set_deleted_in_remote_index(&timeline).await?;
 
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index ebcd70bd39..522c5b57de 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -67,20 +67,19 @@ impl Timeline {
             ),
             false,
             async move {
-                let cancel = task_mgr::shutdown_token();
                 tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = self_clone.cancel.cancelled() => { return Ok(()); }
                     _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
                 };
 
-                self_clone.eviction_task(parent, cancel).await;
+                self_clone.eviction_task(parent).await;
                 Ok(())
             },
         );
     }
 
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
+    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
         use crate::tenant::tasks::random_init_delay;
 
         // acquire the gate guard only once within a useful span
@@ -95,7 +94,7 @@ impl Timeline {
                 EvictionPolicy::OnlyImitiate(lat) => lat.period,
                 EvictionPolicy::NoEviction => Duration::from_secs(10),
             };
-            if random_init_delay(period, &cancel).await.is_err() {
+            if random_init_delay(period, &self.cancel).await.is_err() {
                 return;
             }
         }
@@ -104,13 +103,13 @@ impl Timeline {
         loop {
             let policy = self.get_eviction_policy();
             let cf = self
-                .eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
+                .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
                 .await;
 
             match cf {
                 ControlFlow::Break(()) => break,
                 ControlFlow::Continue(sleep_until) => {
-                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+                    if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
                         .await
                         .is_ok()
                     {
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index d54dc1642c..64edcc5e40 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -120,9 +120,10 @@ impl LayerManager {
     /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
     pub(crate) async fn try_freeze_in_memory_layer(
         &mut self,
-        Lsn(last_record_lsn): Lsn,
+        lsn: Lsn,
         last_freeze_at: &AtomicLsn,
     ) {
+        let Lsn(last_record_lsn) = lsn;
         let end_lsn = Lsn(last_record_lsn + 1);
 
         if let Some(open_layer) = &self.layer_map.open_layer {
@@ -135,8 +136,11 @@ impl LayerManager {
             self.layer_map.frozen_layers.push_back(open_layer_rc);
             self.layer_map.open_layer = None;
             self.layer_map.next_open_layer_at = Some(end_lsn);
-            last_freeze_at.store(end_lsn);
         }
+
+        // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
+        // accounts for regions in the LSN range where we might have ingested no data due to sharding.
+        last_freeze_at.store(end_lsn);
     }
 
     /// Add image layers to the layer map, called from `create_image_layers`.
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index f1b62067f9..a085154a5a 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -24,13 +24,12 @@ mod connection_manager;
 mod walreceiver_connection;
 
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
     connection_manager_loop_step, ConnectionManagerState,
 };
 
-use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::sync::Arc;
@@ -40,8 +39,6 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
-use utils::id::TimelineId;
-
 use self::connection_manager::ConnectionManagerStatus;
 
 use super::Timeline;
@@ -60,9 +57,10 @@ pub struct WalReceiverConf {
 }
 
 pub struct WalReceiver {
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
     manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
+    /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
+    /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
+    cancel: CancellationToken,
 }
 
 impl WalReceiver {
@@ -76,23 +74,23 @@ impl WalReceiver {
         let timeline_id = timeline.timeline_id;
         let walreceiver_ctx =
             ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-
         let loop_status = Arc::new(std::sync::RwLock::new(None));
         let manager_status = Arc::clone(&loop_status);
-        task_mgr::spawn(
-            WALRECEIVER_RUNTIME.handle(),
-            TaskKind::WalReceiverManager,
-            Some(timeline.tenant_shard_id),
-            Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
-            false,
+        let cancel = timeline.cancel.child_token();
+        WALRECEIVER_RUNTIME.spawn({
+            let cancel = cancel.clone();
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
+                // acquire timeline gate so we know the task doesn't outlive the Timeline
+                let Ok(_guard) = timeline.gate.enter() else {
+                    debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
+                    return;
+                };
                 debug!("WAL receiver manager started, connecting to broker");
-                let cancel = task_mgr::shutdown_token();
                 let mut connection_manager_state = ConnectionManagerState::new(
                     timeline,
                     conf,
+                    cancel.clone(),
                 );
                 while !cancel.is_cancelled() {
                     let loop_step_result = connection_manager_loop_step(
@@ -112,25 +110,22 @@ impl WalReceiver {
                 }
                 connection_manager_state.shutdown().await;
                 *loop_status.write().unwrap() = None;
-                Ok(())
+                debug!("task exits");
             }
             .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
-        );
+        });
 
         Self {
-            tenant_shard_id,
-            timeline_id,
             manager_status,
+            cancel,
         }
     }
 
-    pub async fn stop(self) {
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
-        )
-        .await;
+    #[instrument(skip_all, level = tracing::Level::DEBUG)]
+    pub fn cancel(&self) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        debug!("cancelling walreceiver tasks");
+        self.cancel.cancel();
     }
 
     pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
@@ -164,14 +159,18 @@ enum TaskStateUpdate<E> {
 
 impl<E: Clone> TaskHandle<E> {
     /// Initializes the task, starting it immediately after the creation.
+    ///
+    /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
+    /// It being a child token enables us to provide a [`Self::shutdown`] method.
     fn spawn<Fut>(
+        cancel_parent: &CancellationToken,
         task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
     ) -> Self
     where
         Fut: Future<Output = anyhow::Result<()>> + Send,
         E: Send + Sync + 'static,
     {
-        let cancellation = CancellationToken::new();
+        let cancellation = cancel_parent.child_token();
         let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
 
         let cancellation_clone = cancellation.clone();
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 030d24a017..dae31934ad 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -280,6 +280,8 @@ pub(super) struct ConnectionManagerState {
     id: TenantTimelineId,
     /// Use pageserver data about the timeline to filter out some of the safekeepers.
     timeline: Arc<Timeline>,
+    /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
+    cancel: CancellationToken,
     conf: WalReceiverConf,
     /// Current connection to safekeeper for WAL streaming.
     wal_connection: Option<WalConnection>,
@@ -402,7 +404,11 @@ struct BrokerSkTimeline {
 }
 
 impl ConnectionManagerState {
-    pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
+    pub(super) fn new(
+        timeline: Arc<Timeline>,
+        conf: WalReceiverConf,
+        cancel: CancellationToken,
+    ) -> Self {
         let id = TenantTimelineId {
             tenant_id: timeline.tenant_shard_id.tenant_id,
             timeline_id: timeline.timeline_id,
@@ -410,6 +416,7 @@ impl ConnectionManagerState {
         Self {
             id,
             timeline,
+            cancel,
             conf,
             wal_connection: None,
             wal_stream_candidates: HashMap::new(),
@@ -417,6 +424,22 @@ impl ConnectionManagerState {
         }
     }
 
+    fn spawn<Fut>(
+        &self,
+        task: impl FnOnce(
+                tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
+                CancellationToken,
+            ) -> Fut
+            + Send
+            + 'static,
+    ) -> TaskHandle<WalConnectionStatus>
+    where
+        Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
+    {
+        // TODO: get rid of TaskHandle
+        super::TaskHandle::spawn(&self.cancel, task)
+    }
+
     /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
     async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
         WALRECEIVER_SWITCHES
@@ -435,7 +458,7 @@ impl ConnectionManagerState {
         );
 
         let span = info_span!("connection", %node_id);
-        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
+        let connection_handle = self.spawn(move |events_sender, cancellation| {
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
 
@@ -463,6 +486,12 @@ impl ConnectionManagerState {
                                 info!("walreceiver connection handling ended: {e}");
                                 Ok(())
                             }
+                            WalReceiverError::ClosedGate => {
+                                info!(
+                                    "walreceiver connection handling ended because of closed gate"
+                                );
+                                Ok(())
+                            }
                             WalReceiverError::Other(e) => {
                                 // give out an error to have task_mgr give it a really verbose logging
                                 if cancellation.is_cancelled() {
@@ -1016,7 +1045,7 @@ mod tests {
             sk_id: connected_sk_id,
             availability_zone: None,
             status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                 sender
                     .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
@@ -1184,7 +1213,7 @@ mod tests {
             sk_id: connected_sk_id,
             availability_zone: None,
             status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                 sender
                     .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
@@ -1251,7 +1280,7 @@ mod tests {
             sk_id: NodeId(1),
             availability_zone: None,
             status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                 sender
                     .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
@@ -1315,7 +1344,7 @@ mod tests {
             sk_id: NodeId(1),
             availability_zone: None,
             status: connection_status,
-            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
+            connection_task: state.spawn(move |_, _| async move { Ok(()) }),
             discovered_new_wal: Some(NewCommittedWAL {
                 discovered_at: time_over_threshold,
                 lsn: new_lsn,
@@ -1371,6 +1400,7 @@ mod tests {
                 timeline_id: TIMELINE_ID,
             },
             timeline,
+            cancel: CancellationToken::new(),
             conf: WalReceiverConf {
                 wal_connect_timeout: Duration::from_secs(1),
                 lagging_wal_timeout: Duration::from_secs(1),
@@ -1414,7 +1444,7 @@ mod tests {
             sk_id: connected_sk_id,
             availability_zone: None,
             status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                 sender
                     .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 00a9dbd760..3f3419e886 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,7 +27,6 @@ use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
     metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    task_mgr,
     task_mgr::TaskKind,
     task_mgr::WALRECEIVER_RUNTIME,
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -37,8 +36,8 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::pageserver_feedback::PageserverFeedback;
 use utils::{id::NodeId, lsn::Lsn};
+use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
 
 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -68,6 +67,7 @@ pub(super) enum WalReceiverError {
     SuccessfulCompletion(String),
     /// Generic error
     Other(anyhow::Error),
+    ClosedGate,
 }
 
 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection(
 ) -> Result<(), WalReceiverError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
+    // prevent timeline shutdown from finishing until we have exited
+    let _guard = timeline.gate.enter().map_err(|e| match e {
+        GateError::GateClosed => WalReceiverError::ClosedGate,
+    })?;
+    // This function spawns a side-car task (WalReceiverConnectionPoller).
+    // Get its gate guard now as well.
+    let poller_guard = timeline.gate.enter().map_err(|e| match e {
+        GateError::GateClosed => WalReceiverError::ClosedGate,
+    })?;
+
     WALRECEIVER_STARTED_CONNECTIONS.inc();
 
     // Connect to the database in replication mode.
@@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection(
     }
 
     // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
+    // so spawn it off to run on its own. It shouldn't outlive this function, but,
+    // due to lack of async drop, we can't enforce that. However, we ensure that
+    // 1. it is sensitive to `cancellation` and
+    // 2. holds the Timeline gate open so that after timeline shutdown,
+    //    we know this task is gone.
     let _connection_ctx = ctx.detached_child(
         TaskKind::WalReceiverConnectionPoller,
         ctx.download_behavior(),
     );
     let connection_cancellation = cancellation.clone();
-    task_mgr::spawn(
-        WALRECEIVER_RUNTIME.handle(),
-        TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_shard_id),
-        Some(timeline.timeline_id),
-        "walreceiver connection",
-        false,
+    WALRECEIVER_RUNTIME.spawn(
         async move {
             debug_assert_current_span_has_tenant_and_timeline_id();
-
             select! {
                 connection_result = connection => match connection_result {
                     Ok(()) => debug!("Walreceiver db connection closed"),
@@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection(
                                 // with a similar error.
                             },
                             WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::ClosedGate => {
+                                // doesn't happen at runtime
+                            }
                             WalReceiverError::Other(err) => {
                                 warn!("Connection aborted: {err:#}")
                             }
@@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection(
                 },
                 _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
             }
-            Ok(())
+            drop(poller_guard);
         }
         // Enrich the log lines emitted by this closure with meaningful context.
         // TODO: technically, this task outlives the surrounding function, so, the
@@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection(
 
                 trace!("received XLogData between {startlsn} and {endlsn}");
 
+                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
                 waldecoder.feed_bytes(data);
 
                 {
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index 830c9897ca..5eccf185ac 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
         .map_err(std::io::Error::from)
         .context("statvfs tenants directory")?;
 
-    let blocksz = statvfs.block_size();
+    // https://unix.stackexchange.com/a/703650
+    let blocksz = if statvfs.fragment_size() > 0 {
+        statvfs.fragment_size()
+    } else {
+        statvfs.block_size()
+    };
 
     #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
     let free = statvfs.blocks_available() as u64 * blocksz;
-    let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
+
+    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
+    let used = statvfs
+        .blocks()
+        // use blocks_free instead of available here to match df in case someone compares
+        .saturating_sub(statvfs.blocks_free()) as u64
+        * blocksz;
+
     let captured_at = std::time::SystemTime::now();
 
     let doc = PageserverUtilization {
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 0004f4f3c9..ca41a576fd 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut};
 use pageserver_api::key::key_to_rel_block;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::TenantShardId;
-use std::sync::{Arc, RwLock};
+use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
 use utils::lsn::Lsn;
+use utils::sync::heavier_once_cell;
 
 ///
 /// This is the real implementation that uses a Postgres process to
@@ -53,7 +54,19 @@ pub struct PostgresRedoManager {
     tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
     last_redo_at: std::sync::Mutex<Option<Instant>>,
-    redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
+    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
+    /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
+    /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
+    /// their process object; we use [`Arc::clone`] for that.
+    /// This is primarily because earlier implementations that didn't  use [`heavier_once_cell`]
+    /// had that behavior; it's probably unnecessary.
+    /// The only merit of it is that if one walredo process encounters an error,
+    /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`].
+    /// and retry redo, thereby starting the new process, while other redo tasks might
+    /// still be using the old redo process. But, those other tasks will most likely
+    /// encounter an error as well, and errors are an unexpected condition anyway.
+    /// So, probably we could get rid of the `Arc` in the future.
+    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
 }
 
 ///
@@ -101,6 +114,7 @@ impl PostgresRedoManager {
                         self.conf.wal_redo_timeout,
                         pg_version,
                     )
+                    .await
                 };
                 img = Some(result?);
 
@@ -121,6 +135,7 @@ impl PostgresRedoManager {
                 self.conf.wal_redo_timeout,
                 pg_version,
             )
+            .await
         }
     }
 
@@ -134,7 +149,7 @@ impl PostgresRedoManager {
                     chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
                 })
             },
-            pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
+            pid: self.redo_process.get().map(|p| p.id()),
         })
     }
 }
@@ -152,7 +167,7 @@ impl PostgresRedoManager {
             tenant_shard_id,
             conf,
             last_redo_at: std::sync::Mutex::default(),
-            redo_process: RwLock::new(None),
+            redo_process: heavier_once_cell::OnceCell::default(),
         }
     }
 
@@ -164,8 +179,7 @@ impl PostgresRedoManager {
             if let Some(last_redo_at) = *g {
                 if last_redo_at.elapsed() >= idle_timeout {
                     drop(g);
-                    let mut guard = self.redo_process.write().unwrap();
-                    *guard = None;
+                    drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
                 }
             }
         }
@@ -174,8 +188,11 @@ impl PostgresRedoManager {
     ///
     /// Process one request for WAL redo using wal-redo postgres
     ///
+    /// # Cancel-Safety
+    ///
+    /// Cancellation safe.
     #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
         &self,
         key: Key,
         lsn: Lsn,
@@ -191,42 +208,31 @@ impl PostgresRedoManager {
         const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
-            // launch the WAL redo process on first use
-            let proc: Arc<process::WalRedoProcess> = {
-                let proc_guard = self.redo_process.read().unwrap();
-                match &*proc_guard {
-                    None => {
-                        // "upgrade" to write lock to launch the process
-                        drop(proc_guard);
-                        let mut proc_guard = self.redo_process.write().unwrap();
-                        match &*proc_guard {
-                            None => {
-                                let start = Instant::now();
-                                let proc = Arc::new(
-                                    process::WalRedoProcess::launch(
-                                        self.conf,
-                                        self.tenant_shard_id,
-                                        pg_version,
-                                    )
-                                    .context("launch walredo process")?,
-                                );
-                                let duration = start.elapsed();
-                                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
-                                    .observe(duration.as_secs_f64());
-                                info!(
-                                    duration_ms = duration.as_millis(),
-                                    pid = proc.id(),
-                                    "launched walredo process"
-                                );
-                                *proc_guard = Some(Arc::clone(&proc));
-                                proc
-                            }
-                            Some(proc) => Arc::clone(proc),
-                        }
+            let proc: Arc<process::WalRedoProcess> =
+                match self.redo_process.get_or_init_detached().await {
+                    Ok(guard) => Arc::clone(&guard),
+                    Err(permit) => {
+                        // don't hold poison_guard, the launch code can bail
+                        let start = Instant::now();
+                        let proc = Arc::new(
+                            process::WalRedoProcess::launch(
+                                self.conf,
+                                self.tenant_shard_id,
+                                pg_version,
+                            )
+                            .context("launch walredo process")?,
+                        );
+                        let duration = start.elapsed();
+                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                        info!(
+                            duration_ms = duration.as_millis(),
+                            pid = proc.id(),
+                            "launched walredo process"
+                        );
+                        self.redo_process.set(Arc::clone(&proc), permit);
+                        proc
                     }
-                    Some(proc) => Arc::clone(proc),
-                }
-            };
+                };
 
             let started_at = std::time::Instant::now();
 
@@ -272,34 +278,34 @@ impl PostgresRedoManager {
                     n_attempts,
                     e,
                 );
-                // Avoid concurrent callers hitting the same issue.
-                // We can't prevent it from happening because we want to enable parallelism.
-                {
-                    let mut guard = self.redo_process.write().unwrap();
-                    match &*guard {
-                        Some(current_field_value) => {
-                            if Arc::ptr_eq(current_field_value, &proc) {
-                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                *guard = None;
-                            }
-                        }
-                        None => {
-                            // Another thread was faster to observe the error, and already took the process out of rotation.
-                        }
-                    }
-                }
+                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+                // Note that there may be other tasks concurrent with us that also hold `proc`.
+                // We have to deal with that here.
+                // Also read the doc comment on field `self.redo_process`.
+                //
                 // NB: there may still be other concurrent threads using `proc`.
                 // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-                // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
-                // holding the lock while waiting for the process to exit.
-                // NB: the drop impl blocks the current threads with a wait() system call for
-                // the child process. We dropped the `guard` above so that other threads aren't
-                // affected. But, it's good that the current thread _does_ block to wait.
-                // If we instead deferred the waiting into the background / to tokio, it could
-                // happen that if walredo always fails immediately, we spawn processes faster
+                //
+                // NB: the drop impl blocks the dropping thread with a wait() system call for
+                // the child process. In some ways the blocking is actually good: if we
+                // deferred the waiting into the background / to tokio if we used `tokio::process`,
+                // it could happen that if walredo always fails immediately, we spawn processes faster
                 // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                 // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                 // This probably needs revisiting at some later point.
+                match self.redo_process.get() {
+                    None => (),
+                    Some(guard) => {
+                        if Arc::ptr_eq(&proc, &*guard) {
+                            // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                            guard.take_and_deinit();
+                        } else {
+                            // Another task already spawned another redo process (further up in this method)
+                            // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                        }
+                    }
+                }
+                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
                 drop(proc);
             } else if n_attempts != 0 {
                 info!(n_attempts, "retried walredo succeeded");
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 1bc8a2e87c..2276b4e807 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -495,16 +495,17 @@ retry:
 static void
 pageserver_disconnect(shardno_t shard_no)
 {
-	if (page_servers[shard_no].conn)
-	{
-		/*
-		 * If the connection to any pageserver is lost, we throw away the
-		 * whole prefetch queue, even for other pageservers. It should not
-		 * cause big problems, because connection loss is supposed to be a
-		 * rare event.
-		 */
-		prefetch_on_ps_disconnect();
-	}
+	/*
+	 * If the connection to any pageserver is lost, we throw away the
+	 * whole prefetch queue, even for other pageservers. It should not
+	 * cause big problems, because connection loss is supposed to be a
+	 * rare event.
+	 *
+	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
+	 * because prefetch request may be registered before connection is established.
+	 */
+	prefetch_on_ps_disconnect();
+
 	pageserver_disconnect_shard(shard_no);
 }
 
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index b33cfab2bb..57a16e00ca 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -641,13 +641,12 @@ prefetch_on_ps_disconnect(void)
 static inline void
 prefetch_set_unused(uint64 ring_index)
 {
-	PrefetchRequest *slot = GetPrfSlot(ring_index);
+	PrefetchRequest *slot;
 
 	if (ring_index < MyPState->ring_last)
 		return;					/* Should already be unused */
 
-	Assert(MyPState->ring_unused > ring_index);
-
+	slot = GetPrfSlot(ring_index);
 	if (slot->status == PRFS_UNUSED)
 		return;
 
@@ -806,7 +805,8 @@ Retry:
 			{
 				if (*force_lsn > slot->effective_request_lsn)
 				{
-					prefetch_wait_for(ring_index);
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
@@ -821,7 +821,8 @@ Retry:
 			{
 				if (*force_lsn != slot->effective_request_lsn)
 				{
-					prefetch_wait_for(ring_index);
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
@@ -887,7 +888,8 @@ Retry:
 			{
 				case PRFS_REQUESTED:
 					Assert(MyPState->ring_receive == cleanup_index);
-					prefetch_wait_for(cleanup_index);
+					if (!prefetch_wait_for(cleanup_index))
+						goto Retry;
 					prefetch_set_unused(cleanup_index);
 					break;
 				case PRFS_RECEIVED:
@@ -2140,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
 
 	if (entry != NULL)
@@ -2161,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			 */
 			if (slot->status == PRFS_REQUESTED)
 			{
-				prefetch_wait_for(slot->my_ring_index);
+				if (!prefetch_wait_for(slot->my_ring_index))
+					goto Retry;
 			}
 			/* drop caches */
 			prefetch_set_unused(slot->my_ring_index);
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 57a2736d5b..b327890be2 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -10,6 +10,7 @@ testing = []
 
 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 async-trait.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 7db76f3d9e..415a4b7d85 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -102,8 +102,7 @@ pub(super) async fn authenticate(
 
     ctx.set_user(db_info.user.into());
     ctx.set_project(db_info.aux.clone());
-    let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
-    info!(?cold_start_info, "woken up a compute node");
+    info!("woken up a compute node");
 
     // Backwards compatibility. pg_sni_proxy uses "--" in domain names
     // while direct connections do not. Once we migrate to pg_sni_proxy
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 385f7820cb..c28814b1c8 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -10,6 +10,7 @@ use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
 use proxy::proxy::run_until_cancelled;
+use proxy::{BranchId, EndpointId, ProjectId};
 use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;
 
@@ -269,7 +270,12 @@ async fn handle_client(
 
     let client = tokio::net::TcpStream::connect(destination).await?;
 
-    let metrics_aux: MetricsAuxInfo = Default::default();
+    let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
+        endpoint_id: (&EndpointId::from("")).into(),
+        project_id: (&ProjectId::from("")).into(),
+        branch_id: (&BranchId::from("")).into(),
+        cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
+    };
 
     // doesn't yet matter as pg-sni-router doesn't report analytics logs
     ctx.set_success();
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 88b847f5f1..56a3ef79cd 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -10,6 +10,7 @@ use proxy::auth;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
 use proxy::cancellation::CancellationHandler;
+use proxy::config::remote_storage_from_toml;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
@@ -191,6 +192,19 @@ struct ProxyCliArgs {
 
     #[clap(flatten)]
     parquet_upload: ParquetUploadArgs,
+
+    /// interval for backup metric collection
+    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
+    metric_backup_collection_interval: std::time::Duration,
+    /// remote storage configuration for backup metric collection
+    /// Encoded as toml (same format as pageservers), eg
+    /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
+    #[clap(long, default_value = "{}")]
+    metric_backup_collection_remote_storage: String,
+    /// chunk size for backup metric collection
+    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
+    #[clap(long, default_value = "4194304")]
+    metric_backup_collection_chunk_size: usize,
 }
 
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -372,12 +386,17 @@ async fn main() -> anyhow::Result<()> {
 
     // maintenance tasks. these never return unless there's an error
     let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
     maintenance_tasks.spawn(http::health_server::task_main(http_listener));
     maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
 
     if let Some(metrics_config) = &config.metric_collection {
+        // TODO: Add gc regardles of the metric collection being enabled.
         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
+        client_tasks.spawn(usage_metrics::task_backup(
+            &metrics_config.backup_metric_collection_config,
+            cancellation_token,
+        ));
     }
 
     if let auth::BackendType::Console(api, _) = &config.auth_backend {
@@ -434,6 +453,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     if args.allow_self_signed_compute {
         warn!("allowing self-signed compute certificates");
     }
+    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
+        interval: args.metric_backup_collection_interval,
+        remote_storage_config: remote_storage_from_toml(
+            &args.metric_backup_collection_remote_storage,
+        )?,
+        chunk_size: args.metric_backup_collection_chunk_size,
+    };
 
     let metric_collection = match (
         &args.metric_collection_endpoint,
@@ -442,6 +468,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
             endpoint: endpoint.parse()?,
             interval: humantime::parse_duration(interval)?,
+            backup_metric_collection_config,
         }),
         (None, None) => None,
         _ => bail!(
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 5a3660520b..d8a1d261ce 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -16,7 +16,7 @@ use crate::{
     config::ProjectInfoCacheOptions,
     console::AuthSecret,
     intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
-    EndpointId, ProjectId, RoleName,
+    EndpointId, RoleName,
 };
 
 use super::{Cache, Cached};
@@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl {
     }
     pub fn insert_role_secret(
         &self,
-        project_id: &ProjectId,
-        endpoint_id: &EndpointId,
-        role_name: &RoleName,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
+        role_name: RoleNameInt,
         secret: Option<AuthSecret>,
     ) {
-        let project_id = ProjectIdInt::from(project_id);
-        let endpoint_id = EndpointIdInt::from(endpoint_id);
-        let role_name = RoleNameInt::from(role_name);
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
             return;
@@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl {
     }
     pub fn insert_allowed_ips(
         &self,
-        project_id: &ProjectId,
-        endpoint_id: &EndpointId,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
         allowed_ips: Arc<Vec<IpPattern>>,
     ) {
-        let project_id = ProjectIdInt::from(project_id);
-        let endpoint_id = EndpointIdInt::from(endpoint_id);
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
             return;
@@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::scram::ServerSecret;
+    use crate::{scram::ServerSecret, ProjectId};
 
     #[tokio::test]
     async fn test_project_info_cache_settings() {
@@ -369,8 +364,8 @@ mod tests {
             ttl: Duration::from_secs(1),
             gc_interval: Duration::from_secs(600),
         });
-        let project_id = "project".into();
-        let endpoint_id = "endpoint".into();
+        let project_id: ProjectId = "project".into();
+        let endpoint_id: EndpointId = "endpoint".into();
         let user1: RoleName = "user1".into();
         let user2: RoleName = "user2".into();
         let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -379,9 +374,23 @@ mod tests {
             "127.0.0.1".parse().unwrap(),
             "127.0.0.2".parse().unwrap(),
         ]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user1).into(),
+            secret1.clone(),
+        );
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user2).into(),
+            secret2.clone(),
+        );
+        cache.insert_allowed_ips(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            allowed_ips.clone(),
+        );
 
         let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
         assert!(cached.cached());
@@ -393,7 +402,12 @@ mod tests {
         // Shouldn't add more than 2 roles.
         let user3: RoleName = "user3".into();
         let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
-        cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user3).into(),
+            secret3.clone(),
+        );
         assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
 
         let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
@@ -421,8 +435,8 @@ mod tests {
         cache.clone().disable_ttl();
         tokio::time::advance(Duration::from_secs(2)).await;
 
-        let project_id = "project".into();
-        let endpoint_id = "endpoint".into();
+        let project_id: ProjectId = "project".into();
+        let endpoint_id: EndpointId = "endpoint".into();
         let user1: RoleName = "user1".into();
         let user2: RoleName = "user2".into();
         let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -431,9 +445,23 @@ mod tests {
             "127.0.0.1".parse().unwrap(),
             "127.0.0.2".parse().unwrap(),
         ]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user1).into(),
+            secret1.clone(),
+        );
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user2).into(),
+            secret2.clone(),
+        );
+        cache.insert_allowed_ips(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            allowed_ips.clone(),
+        );
 
         tokio::time::advance(Duration::from_secs(2)).await;
         // Nothing should be invalidated.
@@ -470,8 +498,8 @@ mod tests {
             gc_interval: Duration::from_secs(600),
         }));
 
-        let project_id = "project".into();
-        let endpoint_id = "endpoint".into();
+        let project_id: ProjectId = "project".into();
+        let endpoint_id: EndpointId = "endpoint".into();
         let user1: RoleName = "user1".into();
         let user2: RoleName = "user2".into();
         let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -480,10 +508,20 @@ mod tests {
             "127.0.0.1".parse().unwrap(),
             "127.0.0.2".parse().unwrap(),
         ]);
-        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user1).into(),
+            secret1.clone(),
+        );
         cache.clone().disable_ttl();
         tokio::time::advance(Duration::from_millis(100)).await;
-        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
+        cache.insert_role_secret(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            (&user2).into(),
+            secret2.clone(),
+        );
 
         // Added before ttl was disabled + ttl should be still cached.
         let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
@@ -497,7 +535,11 @@ mod tests {
         assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
 
         // Added after ttl was disabled + ttl should not be cached.
-        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
+        cache.insert_allowed_ips(
+            (&project_id).into(),
+            (&endpoint_id).into(),
+            allowed_ips.clone(),
+        );
         let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
         assert!(!cached.cached());
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 65153babcb..ee33b97fbd 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -276,6 +276,7 @@ impl ConnCfg {
         let stream = connection.stream.into_inner();
 
         info!(
+            cold_start_info = ctx.cold_start_info.as_str(),
             "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
             self.0.get_ssl_mode()
         );
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 361c3ef519..fc490c7348 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -5,6 +5,7 @@ use crate::{
 };
 use anyhow::{bail, ensure, Context, Ok};
 use itertools::Itertools;
+use remote_storage::RemoteStorageConfig;
 use rustls::{
     crypto::ring::sign,
     pki_types::{CertificateDer, PrivateKeyDer},
@@ -39,6 +40,7 @@ pub struct ProxyConfig {
 pub struct MetricCollectionConfig {
     pub endpoint: reqwest::Url,
     pub interval: Duration,
+    pub backup_metric_collection_config: MetricBackupCollectionConfig,
 }
 
 pub struct TlsConfig {
@@ -311,6 +313,21 @@ impl CertResolver {
     }
 }
 
+#[derive(Debug)]
+pub struct MetricBackupCollectionConfig {
+    pub interval: Duration,
+    pub remote_storage_config: OptRemoteStorageConfig,
+    pub chunk_size: usize,
+}
+
+/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
+/// runtime type errors from the value parser we use.
+pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
+
+pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
+    RemoteStorageConfig::from_toml(&s.parse()?)
+}
+
 /// Helper for cmdline cache options parsing.
 #[derive(Debug)]
 pub struct CacheOptions {
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 102076f2c6..45161f5ac8 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -3,7 +3,7 @@ use std::fmt;
 
 use crate::auth::IpPattern;
 
-use crate::{BranchId, EndpointId, ProjectId};
+use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
@@ -18,7 +18,7 @@ pub struct ConsoleError {
 pub struct GetRoleSecret {
     pub role_secret: Box<str>,
     pub allowed_ips: Option<Vec<IpPattern>>,
-    pub project_id: Option<ProjectId>,
+    pub project_id: Option<ProjectIdInt>,
 }
 
 // Manually implement debug to omit sensitive info.
@@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo {
 
 /// Various labels for prometheus metrics.
 /// Also known as `ProxyMetricsAuxInfo` in the console.
-#[derive(Debug, Deserialize, Clone, Default)]
+#[derive(Debug, Deserialize, Clone)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: EndpointId,
-    pub project_id: ProjectId,
-    pub branch_id: BranchId,
-    pub cold_start_info: Option<ColdStartInfo>,
+    pub endpoint_id: EndpointIdInt,
+    pub project_id: ProjectIdInt,
+    pub branch_id: BranchIdInt,
+    #[serde(default)]
+    pub cold_start_info: ColdStartInfo,
 }
 
-#[derive(Debug, Default, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
 #[serde(rename_all = "snake_case")]
 pub enum ColdStartInfo {
     #[default]
-    Unknown = 0,
-    Warm = 1,
-    PoolHit = 2,
-    PoolMiss = 3,
+    Unknown,
+    /// Compute was already running
+    Warm,
+    #[serde(rename = "pool_hit")]
+    /// Compute was not running but there was an available VM
+    VmPoolHit,
+    #[serde(rename = "pool_miss")]
+    /// Compute was not running and there were no VMs available
+    VmPoolMiss,
+
+    // not provided by control plane
+    /// Connection available from HTTP pool
+    HttpPoolHit,
+    /// Cached connection info
+    WarmCached,
+}
+
+impl ColdStartInfo {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            ColdStartInfo::Unknown => "unknown",
+            ColdStartInfo::Warm => "warm",
+            ColdStartInfo::VmPoolHit => "pool_hit",
+            ColdStartInfo::VmPoolMiss => "pool_miss",
+            ColdStartInfo::HttpPoolHit => "http_pool_hit",
+            ColdStartInfo::WarmCached => "warm_cached",
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 69bfd6b045..f7d621fb12 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -12,7 +12,8 @@ use crate::{
     compute,
     config::{CacheOptions, ProjectInfoCacheOptions},
     context::RequestMonitoring,
-    scram, EndpointCacheKey, ProjectId,
+    intern::ProjectIdInt,
+    scram, EndpointCacheKey,
 };
 use dashmap::DashMap;
 use std::{sync::Arc, time::Duration};
@@ -271,7 +272,7 @@ pub struct AuthInfo {
     /// List of IP addresses allowed for the autorization.
     pub allowed_ips: Vec<IpPattern>,
     /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<ProjectId>,
+    pub project_id: Option<ProjectIdInt>,
 }
 
 /// Info for establishing a connection to a compute node.
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index b759c81373..cfe491f2aa 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,10 +4,16 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
-use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
+use crate::{
+    console::{
+        messages::MetricsAuxInfo,
+        provider::{CachedAllowedIps, CachedRoleSecret},
+    },
+    BranchId, EndpointId, ProjectId,
+};
 use futures::TryFutureExt;
 use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
@@ -114,7 +120,12 @@ impl Api {
 
         let node = NodeInfo {
             config,
-            aux: Default::default(),
+            aux: MetricsAuxInfo {
+                endpoint_id: (&EndpointId::from("endpoint")).into(),
+                project_id: (&ProjectId::from("project")).into(),
+                branch_id: (&BranchId::from("branch")).into(),
+                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
+            },
             allow_self_signed_compute: false,
         };
 
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 289b0c08f7..1a3e2ca795 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -181,15 +181,16 @@ impl super::Api for Api {
         }
         let auth_info = self.do_get_auth_info(ctx, user_info).await?;
         if let Some(project_id) = auth_info.project_id {
+            let ep_int = ep.into();
             self.caches.project_info.insert_role_secret(
-                &project_id,
-                ep,
-                user,
+                project_id,
+                ep_int,
+                user.into(),
                 auth_info.secret.clone(),
             );
             self.caches.project_info.insert_allowed_ips(
-                &project_id,
-                ep,
+                project_id,
+                ep_int,
                 Arc::new(auth_info.allowed_ips),
             );
             ctx.set_project_id(project_id);
@@ -217,15 +218,16 @@ impl super::Api for Api {
         let allowed_ips = Arc::new(auth_info.allowed_ips);
         let user = &user_info.user;
         if let Some(project_id) = auth_info.project_id {
+            let ep_int = ep.into();
             self.caches.project_info.insert_role_secret(
-                &project_id,
-                ep,
-                user,
+                project_id,
+                ep_int,
+                user.into(),
                 auth_info.secret.clone(),
             );
             self.caches
                 .project_info
-                .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
+                .insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
             ctx.set_project_id(project_id);
         }
         Ok((
@@ -248,8 +250,7 @@ impl super::Api for Api {
         // which means that we might cache it to reduce the load and latency.
         if let Some(cached) = self.caches.node_info.get(&key) {
             info!(key = &*key, "found cached compute node info");
-            info!("cold_start_info=warm");
-            ctx.set_cold_start_info(ColdStartInfo::Warm);
+            ctx.set_project(cached.aux.clone());
             return Ok(cached);
         }
 
@@ -260,17 +261,21 @@ impl super::Api for Api {
         if permit.should_check_cache() {
             if let Some(cached) = self.caches.node_info.get(&key) {
                 info!(key = &*key, "found cached compute node info");
-                info!("cold_start_info=warm");
-                ctx.set_cold_start_info(ColdStartInfo::Warm);
+                ctx.set_project(cached.aux.clone());
                 return Ok(cached);
             }
         }
 
-        let node = self.do_wake_compute(ctx, user_info).await?;
+        let mut node = self.do_wake_compute(ctx, user_info).await?;
         ctx.set_project(node.aux.clone());
-        let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default();
-        info!(?cold_start_info, "woken up a compute node");
-        let (_, cached) = self.caches.node_info.insert(key.clone(), node);
+        let cold_start_info = node.aux.cold_start_info;
+        info!("woken up a compute node");
+
+        // store the cached node as 'warm'
+        node.aux.cold_start_info = ColdStartInfo::WarmCached;
+        let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
+        cached.aux.cold_start_info = cold_start_info;
+
         info!(key = &*key, "created a cache entry for compute node info");
 
         Ok(cached)
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 7ca830cdb4..fec95f4722 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -11,8 +11,9 @@ use uuid::Uuid;
 use crate::{
     console::messages::{ColdStartInfo, MetricsAuxInfo},
     error::ErrorKind,
+    intern::{BranchIdInt, ProjectIdInt},
     metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
-    BranchId, DbName, EndpointId, ProjectId, RoleName,
+    DbName, EndpointId, RoleName,
 };
 
 use self::parquet::RequestData;
@@ -34,8 +35,8 @@ pub struct RequestMonitoring {
     pub span: Span,
 
     // filled in as they are discovered
-    project: Option<ProjectId>,
-    branch: Option<BranchId>,
+    project: Option<ProjectIdInt>,
+    branch: Option<BranchIdInt>,
     endpoint_id: Option<EndpointId>,
     dbname: Option<DbName>,
     user: Option<RoleName>,
@@ -43,7 +44,7 @@ pub struct RequestMonitoring {
     error_kind: Option<ErrorKind>,
     pub(crate) auth_method: Option<AuthMethod>,
     success: bool,
-    cold_start_info: Option<ColdStartInfo>,
+    pub(crate) cold_start_info: ColdStartInfo,
 
     // extra
     // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -92,7 +93,7 @@ impl RequestMonitoring {
             error_kind: None,
             auth_method: None,
             success: false,
-            cold_start_info: None,
+            cold_start_info: ColdStartInfo::Unknown,
 
             sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
             latency_timer: LatencyTimer::new(protocol),
@@ -113,26 +114,31 @@ impl RequestMonitoring {
     }
 
     pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
-        self.cold_start_info = Some(info);
+        self.cold_start_info = info;
+        self.latency_timer.cold_start_info(info);
     }
 
     pub fn set_project(&mut self, x: MetricsAuxInfo) {
-        self.set_endpoint_id(x.endpoint_id);
+        if self.endpoint_id.is_none() {
+            self.set_endpoint_id(x.endpoint_id.as_str().into())
+        }
         self.branch = Some(x.branch_id);
         self.project = Some(x.project_id);
-        self.cold_start_info = x.cold_start_info;
+        self.set_cold_start_info(x.cold_start_info);
     }
 
-    pub fn set_project_id(&mut self, project_id: ProjectId) {
+    pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
         self.project = Some(project_id);
     }
 
     pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
-        self.span.record("ep", display(&endpoint_id));
-        crate::metrics::CONNECTING_ENDPOINTS
-            .with_label_values(&[self.protocol])
-            .measure(&endpoint_id);
-        self.endpoint_id = Some(endpoint_id);
+        if self.endpoint_id.is_none() {
+            self.span.record("ep", display(&endpoint_id));
+            crate::metrics::CONNECTING_ENDPOINTS
+                .with_label_values(&[self.protocol])
+                .measure(&endpoint_id);
+            self.endpoint_id = Some(endpoint_id);
+        }
     }
 
     pub fn set_application(&mut self, app: Option<SmolStr>) {
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index a2be1c4186..eb77409429 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -13,12 +13,14 @@ use parquet::{
     },
     record::RecordWriter,
 };
-use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
+use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::{sync::mpsc, time};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
 use utils::backoff;
 
+use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};
+
 use super::{RequestMonitoring, LOG_CHAN};
 
 #[derive(clap::Args, Clone, Debug)]
@@ -50,21 +52,13 @@ pub struct ParquetUploadArgs {
     parquet_upload_compression: Compression,
 }
 
-/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
-/// runtime type errors from the value parser we use.
-type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
-
-fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
-    RemoteStorageConfig::from_toml(&s.parse()?)
-}
-
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a upload fails, we log it at info-level, and retry.
 // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_UPLOAD_RETRIES times, we give up
-pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
-pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
+pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
 
 // the parquet crate leaves a lot to be desired...
 // what follows is an attempt to write parquet files with minimal allocs.
@@ -93,7 +87,7 @@ pub struct RequestData {
     /// Or if we make it to proxy_pass
     success: bool,
     /// Indicates if the cplane started the new compute node for this request.
-    cold_start_info: Option<&'static str>,
+    cold_start_info: &'static str,
     /// Tracks time from session start (HTTP request/libpq TCP handshake)
     /// Through to success/failure
     duration_us: u64,
@@ -121,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData {
             region: value.region,
             error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
             success: value.success,
-            cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
-                crate::console::messages::ColdStartInfo::Unknown => "unknown",
-                crate::console::messages::ColdStartInfo::Warm => "warm",
-                crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
-                crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
-            }),
+            cold_start_info: value.cold_start_info.as_str(),
             duration_us: SystemTime::from(value.first_packet)
                 .elapsed()
                 .unwrap_or_default()
@@ -460,7 +449,7 @@ mod tests {
             region: "us-east-1",
             error: None,
             success: rng.gen(),
-            cold_start_info: Some("no"),
+            cold_start_info: "no",
             duration_us: rng.gen_range(0..30_000_000),
         }
     }
@@ -530,15 +519,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1314406, 3, 6000),
-                (1314399, 3, 6000),
-                (1314459, 3, 6000),
-                (1314416, 3, 6000),
-                (1314546, 3, 6000),
-                (1314388, 3, 6000),
-                (1314180, 3, 6000),
-                (1314416, 3, 6000),
-                (438359, 1, 2000)
+                (1314385, 3, 6000),
+                (1314378, 3, 6000),
+                (1314438, 3, 6000),
+                (1314395, 3, 6000),
+                (1314525, 3, 6000),
+                (1314367, 3, 6000),
+                (1314159, 3, 6000),
+                (1314395, 3, 6000),
+                (438352, 1, 2000)
             ]
         );
 
@@ -568,11 +557,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1220668, 5, 10000),
-                (1226818, 5, 10000),
-                (1228612, 5, 10000),
-                (1227974, 5, 10000),
-                (1219252, 5, 10000)
+                (1220633, 5, 10000),
+                (1226783, 5, 10000),
+                (1228577, 5, 10000),
+                (1227939, 5, 10000),
+                (1219217, 5, 10000)
             ]
         );
 
@@ -604,11 +593,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1206315, 5, 10000),
-                (1206046, 5, 10000),
-                (1206339, 5, 10000),
-                (1206327, 5, 10000),
-                (1206582, 5, 10000)
+                (1206280, 5, 10000),
+                (1206011, 5, 10000),
+                (1206304, 5, 10000),
+                (1206292, 5, 10000),
+                (1206547, 5, 10000)
             ]
         );
 
@@ -633,15 +622,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1314406, 3, 6000),
-                (1314399, 3, 6000),
-                (1314459, 3, 6000),
-                (1314416, 3, 6000),
-                (1314546, 3, 6000),
-                (1314388, 3, 6000),
-                (1314180, 3, 6000),
-                (1314416, 3, 6000),
-                (438359, 1, 2000)
+                (1314385, 3, 6000),
+                (1314378, 3, 6000),
+                (1314438, 3, 6000),
+                (1314395, 3, 6000),
+                (1314525, 3, 6000),
+                (1314367, 3, 6000),
+                (1314159, 3, 6000),
+                (1314395, 3, 6000),
+                (438352, 1, 2000)
             ]
         );
 
@@ -678,7 +667,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
+            [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
         );
 
         tmpdir.close().unwrap();
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 4172dc19da..59ee899c08 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -12,6 +12,8 @@ use metrics::{
 use once_cell::sync::Lazy;
 use tokio::time::{self, Instant};
 
+use crate::console::messages::ColdStartInfo;
+
 pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
     register_int_counter_pair_vec!(
         "proxy_opened_db_connections_total",
@@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
         "proxy_compute_connection_latency_seconds",
         "Time it took for proxy to establish a connection to the compute endpoint",
         // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
-        // 3 * 2 * 2 * 2 * 2 = 48 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
+        // 3 * 6 * 2 * 2 = 72 counters
+        &["protocol", "cold_start_info", "outcome", "excluded"],
         // largest bucket = 2^16 * 0.5ms = 32s
         exponential_buckets(0.0005, 2.0, 16).unwrap(),
     )
@@ -117,12 +119,15 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
     .unwrap()
 });
 
-pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
         "proxy_http_conn_content_length_bytes",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
-        exponential_buckets(8.0, 2.0, 20).unwrap()
+        "Number of bytes the HTTP response content consumes",
+        // request/response
+        &["direction"],
+        // smallest bucket = 16 bytes
+        // largest bucket = 4^12 * 16 bytes = 256MB
+        exponential_buckets(16.0, 4.0, 12).unwrap()
     )
     .unwrap()
 });
@@ -180,6 +185,20 @@ struct Accumulated {
     compute: time::Duration,
 }
 
+enum Outcome {
+    Success,
+    Failed,
+}
+
+impl Outcome {
+    fn as_str(&self) -> &'static str {
+        match self {
+            Outcome::Success => "success",
+            Outcome::Failed => "failed",
+        }
+    }
+}
+
 pub struct LatencyTimer {
     // time since the stopwatch was started
     start: time::Instant,
@@ -189,9 +208,8 @@ pub struct LatencyTimer {
     accumulated: Accumulated,
     // label data
     protocol: &'static str,
-    cache_miss: bool,
-    pool_miss: bool,
-    outcome: &'static str,
+    cold_start_info: ColdStartInfo,
+    outcome: Outcome,
 }
 
 pub struct LatencyTimerPause<'a> {
@@ -207,11 +225,9 @@ impl LatencyTimer {
             stop: None,
             accumulated: Accumulated::default(),
             protocol,
-            cache_miss: false,
-            // by default we don't do pooling
-            pool_miss: true,
+            cold_start_info: ColdStartInfo::Unknown,
             // assume failed unless otherwise specified
-            outcome: "failed",
+            outcome: Outcome::Failed,
         }
     }
 
@@ -223,12 +239,8 @@ impl LatencyTimer {
         }
     }
 
-    pub fn cache_miss(&mut self) {
-        self.cache_miss = true;
-    }
-
-    pub fn pool_hit(&mut self) {
-        self.pool_miss = false;
+    pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) {
+        self.cold_start_info = cold_start_info;
     }
 
     pub fn success(&mut self) {
@@ -236,7 +248,7 @@ impl LatencyTimer {
         self.stop = Some(time::Instant::now());
 
         // success
-        self.outcome = "success";
+        self.outcome = Outcome::Success;
     }
 }
 
@@ -261,9 +273,8 @@ impl Drop for LatencyTimer {
         COMPUTE_CONNECTION_LATENCY
             .with_label_values(&[
                 self.protocol,
-                bool_to_str(self.cache_miss),
-                bool_to_str(self.pool_miss),
-                self.outcome,
+                self.cold_start_info.as_str(),
+                self.outcome.as_str(),
                 "client",
             ])
             .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
@@ -272,9 +283,8 @@ impl Drop for LatencyTimer {
         COMPUTE_CONNECTION_LATENCY
             .with_label_values(&[
                 self.protocol,
-                bool_to_str(self.cache_miss),
-                bool_to_str(self.pool_miss),
-                self.outcome,
+                self.cold_start_info.as_str(),
+                self.outcome.as_str(),
                 "client_and_cplane",
             ])
             .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index c76e2ff6d9..4c0d68ce0b 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
 }
 
 /// Try to connect to the compute node, retrying if necessary.
-/// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
 pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
     ctx: &mut RequestMonitoring,
@@ -132,7 +131,6 @@ where
     } else {
         // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
         info!("compute node's state has likely changed; requesting a wake-up");
-        ctx.latency_timer.cache_miss();
         let old_node_info = invalidate_cache(node_info);
         let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
         node_info.reuse_settings(old_node_info);
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index f6d4314391..c81a1a8292 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -4,7 +4,7 @@ use crate::{
     console::messages::MetricsAuxInfo,
     metrics::NUM_BYTES_PROXIED_COUNTER,
     stream::Stream,
-    usage_metrics::{Ids, USAGE_METRICS},
+    usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
 };
 use metrics::IntCounterPairGuard;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -19,8 +19,8 @@ pub async fn proxy_pass(
     aux: MetricsAuxInfo,
 ) -> anyhow::Result<()> {
     let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.clone(),
-        branch_id: aux.branch_id.clone(),
+        endpoint_id: aux.endpoint_id,
+        branch_id: aux.branch_id,
     });
 
     let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index a4051447c1..71d85e106d 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -12,11 +12,12 @@ use crate::auth::backend::{
 };
 use crate::config::CertResolver;
 use crate::console::caches::NodeInfoCache;
+use crate::console::messages::MetricsAuxInfo;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
-use crate::{http, sasl, scram};
+use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use rstest::rstest;
@@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism {
 fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
     let node = NodeInfo {
         config: compute::ConnCfg::new(),
-        aux: Default::default(),
+        aux: MetricsAuxInfo {
+            endpoint_id: (&EndpointId::from("endpoint")).into(),
+            project_id: (&ProjectId::from("project")).into(),
+            branch_id: (&BranchId::from("branch")).into(),
+            cold_start_info: crate::console::messages::ColdStartInfo::Warm,
+        },
         allow_self_signed_compute: false,
     };
     let (_, node) = cache.insert("key".into(), node);
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index f10779d7ba..8aa5ad4e8a 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -9,7 +9,6 @@ use crate::{
     config::ProxyConfig,
     console::{
         errors::{GetAuthInfoError, WakeComputeError},
-        messages::ColdStartInfo,
         CachedNodeInfo,
     },
     context::RequestMonitoring,
@@ -57,7 +56,10 @@ impl PoolingBackend {
         let auth_outcome =
             crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
         let res = match auth_outcome {
-            crate::sasl::Outcome::Success(key) => Ok(key),
+            crate::sasl::Outcome::Success(key) => {
+                info!("user successfully authenticated");
+                Ok(key)
+            }
             crate::sasl::Outcome::Failure(reason) => {
                 info!("auth backend failed with an error: {reason}");
                 Err(AuthError::auth_failed(&*conn_info.user_info.user))
@@ -89,8 +91,6 @@ impl PoolingBackend {
         };
 
         if let Some(client) = maybe_client {
-            info!("cold_start_info=warm");
-            ctx.set_cold_start_info(ColdStartInfo::Warm);
             return Ok(client);
         }
         let conn_id = uuid::Uuid::new_v4();
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index c7e8eaef76..35311facb8 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -17,7 +17,7 @@ use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
 
-use crate::console::messages::MetricsAuxInfo;
+use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{
@@ -383,9 +383,12 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                     "pid",
                     &tracing::field::display(client.inner.get_process_id()),
                 );
-                info!("pool: reusing connection '{conn_info}'");
+                info!(
+                    cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
+                    "pool: reusing connection '{conn_info}'"
+                );
                 client.session.send(ctx.session_id)?;
-                ctx.latency_timer.pool_hit();
+                ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
                 ctx.latency_timer.success();
                 return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
             }
@@ -454,8 +457,9 @@ pub fn poll_client<C: ClientInnerExt>(
     let (tx, mut rx) = tokio::sync::watch::channel(session_id);
 
     let span = info_span!(parent: None, "connection", %conn_id);
+    let cold_start_info = ctx.cold_start_info;
     span.in_scope(|| {
-        info!(%conn_info, %session_id, "new connection");
+        info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
     });
     let pool = match conn_info.endpoint_cache_key() {
         Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
@@ -565,8 +569,8 @@ impl<C: ClientInnerExt> Client<C> {
     pub fn metrics(&self) -> Arc<MetricCounter> {
         let aux = &self.inner.as_ref().unwrap().aux;
         USAGE_METRICS.register(Ids {
-            endpoint_id: aux.endpoint_id.clone(),
-            branch_id: aux.branch_id.clone(),
+            endpoint_id: aux.endpoint_id,
+            branch_id: aux.branch_id,
         })
     }
 }
@@ -666,6 +670,8 @@ impl<C: ClientInnerExt> Drop for Client<C> {
 mod tests {
     use std::{mem, sync::atomic::AtomicBool};
 
+    use crate::{BranchId, EndpointId, ProjectId};
+
     use super::*;
 
     struct MockClient(Arc<AtomicBool>);
@@ -691,7 +697,12 @@ mod tests {
         ClientInner {
             inner: client,
             session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
-            aux: Default::default(),
+            aux: MetricsAuxInfo {
+                endpoint_id: (&EndpointId::from("endpoint")).into(),
+                project_id: (&ProjectId::from("project")).into(),
+                branch_id: (&BranchId::from("branch")).into(),
+                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
+            },
             conn_id: uuid::Uuid::new_v4(),
         }
     }
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index f675375ff1..00dffd5784 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -42,12 +42,15 @@ use crate::error::ReportableError;
 use crate::error::UserFacingError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
+use crate::proxy::run_until_cancelled;
 use crate::proxy::NeonOptions;
 use crate::serverless::backend::HttpConnError;
+use crate::usage_metrics::MetricCounterRecorder;
 use crate::DbName;
 use crate::RoleName;
 
 use super::backend::PoolingBackend;
+use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
@@ -219,14 +222,7 @@ pub async fn handle(
     backend: Arc<PoolingBackend>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    let cancel2 = cancel.clone();
-    let handle = tokio::spawn(async move {
-        time::sleep(config.http_config.request_timeout).await;
-        cancel2.cancel();
-    });
-
     let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
-    handle.abort();
 
     let mut response = match result {
         Ok(r) => {
@@ -237,10 +233,7 @@ pub async fn handle(
             let error_kind = e.get_error_kind();
             ctx.set_error_kind(error_kind);
 
-            let message = format!(
-                "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
-                config.http_config.request_timeout.as_secs_f64()
-            );
+            let message = "Query cancelled, connection was terminated";
 
             tracing::info!(
                 kind=error_kind.to_metric_label(),
@@ -434,6 +427,63 @@ impl ReportableError for SqlOverHttpCancel {
     }
 }
 
+#[derive(Clone, Copy, Debug)]
+struct HttpHeaders {
+    raw_output: bool,
+    default_array_mode: bool,
+    txn_isolation_level: Option<IsolationLevel>,
+    txn_read_only: bool,
+    txn_deferrable: bool,
+}
+
+impl HttpHeaders {
+    fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
+        // Determine the output options. Default behaviour is 'false'. Anything that is not
+        // strictly 'true' assumed to be false.
+        let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
+        let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+
+        // isolation level, read only and deferrable
+        let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) {
+            Some(x) => Some(
+                map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?,
+            ),
+            None => None,
+        };
+
+        let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
+        let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
+
+        Ok(Self {
+            raw_output,
+            default_array_mode,
+            txn_isolation_level,
+            txn_read_only,
+            txn_deferrable,
+        })
+    }
+}
+
+fn map_header_to_isolation_level(level: &HeaderValue) -> Option<IsolationLevel> {
+    match level.as_bytes() {
+        b"Serializable" => Some(IsolationLevel::Serializable),
+        b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted),
+        b"ReadCommitted" => Some(IsolationLevel::ReadCommitted),
+        b"RepeatableRead" => Some(IsolationLevel::RepeatableRead),
+        _ => None,
+    }
+}
+
+fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue> {
+    match level {
+        IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")),
+        IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")),
+        IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")),
+        IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")),
+        _ => None,
+    }
+}
+
 async fn handle_inner(
     cancel: CancellationToken,
     config: &'static ProxyConfig,
@@ -450,43 +500,26 @@ async fn handle_inner(
     // Determine the destination and connection params
     //
     let headers = request.headers();
+
     // TLS config should be there.
     let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
     info!(user = conn_info.user_info.user.as_str(), "credentials");
 
-    // Determine the output options. Default behaviour is 'false'. Anything that is not
-    // strictly 'true' assumed to be false.
-    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
-    let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
-
     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
     let allow_pool = !config.http_config.pool_options.opt_in
         || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
 
-    // isolation level, read only and deferrable
-
-    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
-    let txn_isolation_level = match txn_isolation_level_raw {
-        Some(ref x) => Some(match x.as_bytes() {
-            b"Serializable" => IsolationLevel::Serializable,
-            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
-            b"ReadCommitted" => IsolationLevel::ReadCommitted,
-            b"RepeatableRead" => IsolationLevel::RepeatableRead,
-            _ => return Err(SqlOverHttpError::InvalidIsolationLevel),
-        }),
-        None => None,
-    };
-
-    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
-    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
+    let parsed_headers = HttpHeaders::try_parse(headers)?;
 
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
         None => MAX_REQUEST_SIZE + 1,
     };
     info!(request_content_length, "request size in bytes");
-    HTTP_CONTENT_LENGTH.observe(request_content_length as f64);
+    HTTP_CONTENT_LENGTH
+        .with_label_values(&["request"])
+        .observe(request_content_length as f64);
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
@@ -514,20 +547,18 @@ async fn handle_inner(
     }
     .map_err(SqlOverHttpError::from);
 
-    // Run both operations in parallel
-    let (payload, mut client) = match select(
+    let (payload, mut client) = match run_until_cancelled(
+        // Run both operations in parallel
         try_join(
             pin!(fetch_and_process_request),
             pin!(authenticate_and_connect),
         ),
-        pin!(cancel.cancelled()),
+        &cancel,
     )
     .await
     {
-        Either::Left((result, _cancelled)) => result?,
-        Either::Right((_cancelled, _)) => {
-            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
-        }
+        Some(result) => result?,
+        None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)),
     };
 
     let mut response = Response::builder()
@@ -537,95 +568,143 @@ async fn handle_inner(
     //
     // Now execute the query and return the result
     //
-    let mut size = 0;
     let result = match payload {
-        Payload::Single(stmt) => {
-            let mut size = 0;
-            let (inner, mut discard) = client.inner();
-            let cancel_token = inner.cancel_token();
-            let query = pin!(query_to_json(
-                &*inner,
-                stmt,
-                &mut size,
-                raw_output,
-                default_array_mode
-            ));
-            let cancelled = pin!(cancel.cancelled());
-            let res = select(query, cancelled).await;
-            match res {
-                Either::Left((Ok((status, results)), _cancelled)) => {
-                    discard.check_idle(status);
-                    results
-                }
-                Either::Left((Err(e), _cancelled)) => {
-                    discard.discard();
-                    return Err(e);
-                }
-                Either::Right((_cancelled, query)) => {
-                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                        tracing::error!(?err, "could not cancel query");
-                    }
-                    match time::timeout(time::Duration::from_millis(100), query).await {
-                        Ok(Ok((status, results))) => {
-                            discard.check_idle(status);
-                            results
-                        }
-                        Ok(Err(error)) => {
-                            let db_error = match &error {
-                                SqlOverHttpError::ConnectCompute(
-                                    HttpConnError::ConnectionError(e),
-                                )
-                                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
-                                _ => None,
-                            };
-
-                            // if errored for some other reason, it might not be safe to return
-                            if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
-                                discard.discard();
-                            }
-
-                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
-                        }
-                        Err(_timeout) => {
-                            discard.discard();
-                            return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
-                        }
-                    }
-                }
-            }
-        }
+        Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
         Payload::Batch(statements) => {
-            info!("starting transaction");
-            let (inner, mut discard) = client.inner();
-            let cancel_token = inner.cancel_token();
-            let mut builder = inner.build_transaction();
-            if let Some(isolation_level) = txn_isolation_level {
-                builder = builder.isolation_level(isolation_level);
+            if parsed_headers.txn_read_only {
+                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
             }
-            if txn_read_only {
-                builder = builder.read_only(true);
+            if parsed_headers.txn_deferrable {
+                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
             }
-            if txn_deferrable {
-                builder = builder.deferrable(true);
-            }
-
-            let transaction = builder.start().await.map_err(|e| {
-                // if we cannot start a transaction, we should return immediately
-                // and not return to the pool. connection is clearly broken
-                discard.discard();
-                e
-            })?;
-
-            let results = match query_batch(
-                cancel.child_token(),
-                &transaction,
-                statements,
-                &mut size,
-                raw_output,
-                default_array_mode,
-            )
-            .await
+            if let Some(txn_isolation_level) = parsed_headers
+                .txn_isolation_level
+                .and_then(map_isolation_level_to_headers)
             {
+                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
+            }
+
+            statements
+                .process(cancel, &mut client, parsed_headers)
+                .await?
+        }
+    };
+
+    let metrics = client.metrics();
+
+    // how could this possibly fail
+    let body = serde_json::to_string(&result).expect("json serialization should not fail");
+    let len = body.len();
+    let response = response
+        .body(Body::from(body))
+        // only fails if invalid status code or invalid header/values are given.
+        // these are not user configurable so it cannot fail dynamically
+        .expect("building response payload should not fail");
+
+    // count the egress bytes - we miss the TLS and header overhead but oh well...
+    // moving this later in the stack is going to be a lot of effort and ehhhh
+    metrics.record_egress(len as u64);
+    HTTP_CONTENT_LENGTH
+        .with_label_values(&["response"])
+        .observe(len as f64);
+
+    Ok(response)
+}
+
+impl QueryData {
+    async fn process(
+        self,
+        cancel: CancellationToken,
+        client: &mut Client<tokio_postgres::Client>,
+        parsed_headers: HttpHeaders,
+    ) -> Result<Value, SqlOverHttpError> {
+        let (inner, mut discard) = client.inner();
+        let cancel_token = inner.cancel_token();
+
+        let res = match select(
+            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
+            pin!(cancel.cancelled()),
+        )
+        .await
+        {
+            // The query successfully completed.
+            Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
+                discard.check_idle(status);
+                Ok(results)
+            }
+            // The query failed with an error
+            Either::Left((Err(e), __not_yet_cancelled)) => {
+                discard.discard();
+                return Err(e);
+            }
+            // The query was cancelled.
+            Either::Right((_cancelled, query)) => {
+                if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                    tracing::error!(?err, "could not cancel query");
+                }
+                // wait for the query cancellation
+                match time::timeout(time::Duration::from_millis(100), query).await {
+                    // query successed before it was cancelled.
+                    Ok(Ok((status, results))) => {
+                        discard.check_idle(status);
+                        Ok(results)
+                    }
+                    // query failed or was cancelled.
+                    Ok(Err(error)) => {
+                        let db_error = match &error {
+                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                            | SqlOverHttpError::Postgres(e) => e.as_db_error(),
+                            _ => None,
+                        };
+
+                        // if errored for some other reason, it might not be safe to return
+                        if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
+                            discard.discard();
+                        }
+
+                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                    }
+                    Err(_timeout) => {
+                        discard.discard();
+                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                    }
+                }
+            }
+        };
+        res
+    }
+}
+
+impl BatchQueryData {
+    async fn process(
+        self,
+        cancel: CancellationToken,
+        client: &mut Client<tokio_postgres::Client>,
+        parsed_headers: HttpHeaders,
+    ) -> Result<Value, SqlOverHttpError> {
+        info!("starting transaction");
+        let (inner, mut discard) = client.inner();
+        let cancel_token = inner.cancel_token();
+        let mut builder = inner.build_transaction();
+        if let Some(isolation_level) = parsed_headers.txn_isolation_level {
+            builder = builder.isolation_level(isolation_level);
+        }
+        if parsed_headers.txn_read_only {
+            builder = builder.read_only(true);
+        }
+        if parsed_headers.txn_deferrable {
+            builder = builder.deferrable(true);
+        }
+
+        let transaction = builder.start().await.map_err(|e| {
+            // if we cannot start a transaction, we should return immediately
+            // and not return to the pool. connection is clearly broken
+            discard.discard();
+            e
+        })?;
+
+        let results =
+            match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
                 Ok(results) => {
                     info!("commit");
                     let status = transaction.commit().await.map_err(|e| {
@@ -659,44 +738,15 @@ async fn handle_inner(
                 }
             };
 
-            if txn_read_only {
-                response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
-            }
-            if txn_deferrable {
-                response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
-            }
-            if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
-            }
-            json!({ "results": results })
-        }
-    };
-
-    let metrics = client.metrics();
-
-    // how could this possibly fail
-    let body = serde_json::to_string(&result).expect("json serialization should not fail");
-    let len = body.len();
-    let response = response
-        .body(Body::from(body))
-        // only fails if invalid status code or invalid header/values are given.
-        // these are not user configurable so it cannot fail dynamically
-        .expect("building response payload should not fail");
-
-    // count the egress bytes - we miss the TLS and header overhead but oh well...
-    // moving this later in the stack is going to be a lot of effort and ehhhh
-    metrics.record_egress(len as u64);
-
-    Ok(response)
+        Ok(json!({ "results": results }))
+    }
 }
 
 async fn query_batch(
     cancel: CancellationToken,
     transaction: &Transaction<'_>,
     queries: BatchQueryData,
-    total_size: &mut usize,
-    raw_output: bool,
-    array_mode: bool,
+    parsed_headers: HttpHeaders,
 ) -> Result<Vec<Value>, SqlOverHttpError> {
     let mut results = Vec::with_capacity(queries.queries.len());
     let mut current_size = 0;
@@ -705,8 +755,7 @@ async fn query_batch(
             transaction,
             stmt,
             &mut current_size,
-            raw_output,
-            array_mode
+            parsed_headers,
         ));
         let cancelled = pin!(cancel.cancelled());
         let res = select(query, cancelled).await;
@@ -723,7 +772,6 @@ async fn query_batch(
             }
         }
     }
-    *total_size += current_size;
     Ok(results)
 }
 
@@ -731,8 +779,7 @@ async fn query_to_json<T: GenericClient>(
     client: &T,
     data: QueryData,
     current_size: &mut usize,
-    raw_output: bool,
-    default_array_mode: bool,
+    parsed_headers: HttpHeaders,
 ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
     info!("executing query");
     let query_params = data.params;
@@ -792,12 +839,12 @@ async fn query_to_json<T: GenericClient>(
         columns.push(client.get_type(c.type_oid()).await?);
     }
 
-    let array_mode = data.array_mode.unwrap_or(default_array_mode);
+    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
 
     // convert rows to JSON
     let rows = rows
         .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
+        .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
         .collect::<Result<Vec<_>, _>>()?;
 
     // resulting JSON format is based on the format of node-postgres result
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index d75aedf89b..5ffbf95c07 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,20 +1,35 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
-use chrono::{DateTime, Utc};
+use crate::{
+    config::{MetricBackupCollectionConfig, MetricCollectionConfig},
+    context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    http,
+    intern::{BranchIdInt, EndpointIdInt},
+};
+use anyhow::Context;
+use async_compression::tokio::write::GzipEncoder;
+use bytes::Bytes;
+use chrono::{DateTime, Datelike, Timelike, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
+use futures::future::select;
 use once_cell::sync::Lazy;
+use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use serde::{Deserialize, Serialize};
 use std::{
     convert::Infallible,
+    pin::pin,
     sync::{
         atomic::{AtomicU64, AtomicUsize, Ordering},
         Arc,
     },
     time::Duration,
 };
+use tokio::io::AsyncWriteExt;
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, instrument, trace};
+use utils::backoff;
+use uuid::{NoContext, Timestamp};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
@@ -29,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: EndpointId,
-    pub branch_id: BranchId,
+    pub endpoint_id: EndpointIdInt,
+    pub branch_id: BranchIdInt,
+}
+
+pub trait MetricCounterRecorder {
+    /// Record that some bytes were sent from the proxy to the client
+    fn record_egress(&self, bytes: u64);
+    /// Record that some connections were opened
+    fn record_connection(&self, count: usize);
+}
+
+trait MetricCounterReporter {
+    fn get_metrics(&mut self) -> (u64, usize);
+    fn move_metrics(&self) -> (u64, usize);
+}
+
+#[derive(Debug)]
+struct MetricBackupCounter {
+    transmitted: AtomicU64,
+    opened_connections: AtomicUsize,
+}
+
+impl MetricCounterRecorder for MetricBackupCounter {
+    fn record_egress(&self, bytes: u64) {
+        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
+    }
+
+    fn record_connection(&self, count: usize) {
+        self.opened_connections.fetch_add(count, Ordering::AcqRel);
+    }
+}
+
+impl MetricCounterReporter for MetricBackupCounter {
+    fn get_metrics(&mut self) -> (u64, usize) {
+        (
+            *self.transmitted.get_mut(),
+            *self.opened_connections.get_mut(),
+        )
+    }
+    fn move_metrics(&self) -> (u64, usize) {
+        (
+            self.transmitted.swap(0, Ordering::AcqRel),
+            self.opened_connections.swap(0, Ordering::AcqRel),
+        )
+    }
 }
 
 #[derive(Debug)]
 pub struct MetricCounter {
     transmitted: AtomicU64,
     opened_connections: AtomicUsize,
+    backup: Arc<MetricBackupCounter>,
 }
 
-impl MetricCounter {
+impl MetricCounterRecorder for MetricCounter {
     /// Record that some bytes were sent from the proxy to the client
-    pub fn record_egress(&self, bytes: u64) {
+    fn record_egress(&self, bytes: u64) {
         self.transmitted.fetch_add(bytes, Ordering::AcqRel);
+        self.backup.record_egress(bytes);
     }
 
+    /// Record that some connections were opened
+    fn record_connection(&self, count: usize) {
+        self.opened_connections.fetch_add(count, Ordering::AcqRel);
+        self.backup.record_connection(count);
+    }
+}
+
+impl MetricCounterReporter for MetricCounter {
+    fn get_metrics(&mut self) -> (u64, usize) {
+        (
+            *self.transmitted.get_mut(),
+            *self.opened_connections.get_mut(),
+        )
+    }
+    fn move_metrics(&self) -> (u64, usize) {
+        (
+            self.transmitted.swap(0, Ordering::AcqRel),
+            self.opened_connections.swap(0, Ordering::AcqRel),
+        )
+    }
+}
+
+trait Clearable {
     /// extract the value that should be reported
+    fn should_report(self: &Arc<Self>) -> Option<u64>;
+    /// Determine whether the counter should be cleared from the global map.
+    fn should_clear(self: &mut Arc<Self>) -> bool;
+}
+
+impl<C: MetricCounterReporter> Clearable for C {
     fn should_report(self: &Arc<Self>) -> Option<u64> {
         // heuristic to see if the branch is still open
         // if a clone happens while we are observing, the heuristic will be incorrect.
@@ -54,13 +143,12 @@ impl MetricCounter {
         // However, for the strong count to be 1 it must have occured that at one instant
         // all the endpoints were closed, so missing a report because the endpoints are closed is valid.
         let is_open = Arc::strong_count(self) > 1;
-        let opened = self.opened_connections.swap(0, Ordering::AcqRel);
 
         // update cached metrics eagerly, even if they can't get sent
         // (to avoid sending the same metrics twice)
         // see the relevant discussion on why to do so even if the status is not success:
         // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
-        let value = self.transmitted.swap(0, Ordering::AcqRel);
+        let (value, opened) = self.move_metrics();
 
         // Our only requirement is that we report in every interval if there was an open connection
         // if there were no opened connections since, then we don't need to report
@@ -70,15 +158,12 @@ impl MetricCounter {
             Some(value)
         }
     }
-
-    /// Determine whether the counter should be cleared from the global map.
     fn should_clear(self: &mut Arc<Self>) -> bool {
         // we can't clear this entry if it's acquired elsewhere
         let Some(counter) = Arc::get_mut(self) else {
             return false;
         };
-        let opened = *counter.opened_connections.get_mut();
-        let value = *counter.transmitted.get_mut();
+        let (opened, value) = counter.get_metrics();
         // clear if there's no data to report
         value == 0 && opened == 0
     }
@@ -90,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
 #[derive(Default)]
 pub struct Metrics {
     endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+    backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
 }
 
 impl Metrics {
     /// Register a new byte metrics counter for this endpoint
     pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
+        let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
+            entry.clone()
+        } else {
+            self.backup_endpoints
+                .entry(ids.clone())
+                .or_insert_with(|| {
+                    Arc::new(MetricBackupCounter {
+                        transmitted: AtomicU64::new(0),
+                        opened_connections: AtomicUsize::new(0),
+                    })
+                })
+                .clone()
+        };
+
         let entry = if let Some(entry) = self.endpoints.get(&ids) {
             entry.clone()
         } else {
@@ -104,12 +204,13 @@ impl Metrics {
                     Arc::new(MetricCounter {
                         transmitted: AtomicU64::new(0),
                         opened_connections: AtomicUsize::new(0),
+                        backup: backup.clone(),
                     })
                 })
                 .clone()
         };
 
-        entry.opened_connections.fetch_add(1, Ordering::AcqRel);
+        entry.record_connection(1);
         entry
     }
 }
@@ -132,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
 
         let now = Utc::now();
         collect_metrics_iteration(
-            &USAGE_METRICS,
+            &USAGE_METRICS.endpoints,
             &http_client,
             &config.endpoint,
             &hostname,
@@ -144,24 +245,12 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
     }
 }
 
-#[instrument(skip_all)]
-async fn collect_metrics_iteration(
-    metrics: &Metrics,
-    client: &http::ClientWithMiddleware,
-    metric_collection_endpoint: &reqwest::Url,
-    hostname: &str,
-    prev: DateTime<Utc>,
-    now: DateTime<Utc>,
-) {
-    info!(
-        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
+fn collect_and_clear_metrics<C: Clearable>(
+    endpoints: &DashMap<Ids, Arc<C>, FastHasher>,
+) -> Vec<(Ids, u64)> {
     let mut metrics_to_clear = Vec::new();
 
-    let metrics_to_send: Vec<(Ids, u64)> = metrics
-        .endpoints
+    let metrics_to_send: Vec<(Ids, u64)> = endpoints
         .iter()
         .filter_map(|counter| {
             let key = counter.key().clone();
@@ -173,33 +262,71 @@ async fn collect_metrics_iteration(
         })
         .collect();
 
+    for metric in metrics_to_clear {
+        match endpoints.entry(metric) {
+            Entry::Occupied(mut counter) => {
+                if counter.get_mut().should_clear() {
+                    counter.remove_entry();
+                }
+            }
+            Entry::Vacant(_) => {}
+        }
+    }
+    metrics_to_send
+}
+
+fn create_event_chunks<'a>(
+    metrics_to_send: &'a [(Ids, u64)],
+    hostname: &'a str,
+    prev: DateTime<Utc>,
+    now: DateTime<Utc>,
+    chunk_size: usize,
+) -> impl Iterator<Item = EventChunk<'a, Event<Ids, &'static str>>> + 'a {
+    // Split into chunks of 1000 metrics to avoid exceeding the max request size
+    metrics_to_send
+        .chunks(chunk_size)
+        .map(move |chunk| EventChunk {
+            events: chunk
+                .iter()
+                .map(|(ids, value)| Event {
+                    kind: EventType::Incremental {
+                        start_time: prev,
+                        stop_time: now,
+                    },
+                    metric: PROXY_IO_BYTES_PER_CLIENT,
+                    idempotency_key: idempotency_key(hostname),
+                    value: *value,
+                    extra: ids.clone(),
+                })
+                .collect(),
+        })
+}
+
+#[instrument(skip_all)]
+async fn collect_metrics_iteration(
+    endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+    client: &http::ClientWithMiddleware,
+    metric_collection_endpoint: &reqwest::Url,
+    hostname: &str,
+    prev: DateTime<Utc>,
+    now: DateTime<Utc>,
+) {
+    info!(
+        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
+        metric_collection_endpoint
+    );
+
+    let metrics_to_send = collect_and_clear_metrics(endpoints);
+
     if metrics_to_send.is_empty() {
         trace!("no new metrics to send");
     }
 
     // Send metrics.
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
-    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
-        let events = chunk
-            .iter()
-            .map(|(ids, value)| Event {
-                kind: EventType::Incremental {
-                    start_time: prev,
-                    stop_time: now,
-                },
-                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname),
-                value: *value,
-                extra: Ids {
-                    endpoint_id: ids.endpoint_id.clone(),
-                    branch_id: ids.branch_id.clone(),
-                },
-            })
-            .collect();
-
+    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) {
         let res = client
             .post(metric_collection_endpoint.clone())
-            .json(&EventChunk { events })
+            .json(&chunk)
             .send()
             .await;
 
@@ -213,23 +340,142 @@ async fn collect_metrics_iteration(
 
         if !res.status().is_success() {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
+            for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
                 // Report if the metric value is suspiciously large
                 error!("potentially abnormal metric value: {:?}", metric);
             }
         }
     }
+}
 
-    for metric in metrics_to_clear {
-        match metrics.endpoints.entry(metric) {
-            Entry::Occupied(mut counter) => {
-                if counter.get_mut().should_clear() {
-                    counter.remove_entry();
-                }
-            }
-            Entry::Vacant(_) => {}
+pub async fn task_backup(
+    backup_config: &MetricBackupCollectionConfig,
+    cancellation_token: CancellationToken,
+) -> anyhow::Result<()> {
+    info!("metrics backup config: {backup_config:?}");
+    scopeguard::defer! {
+        info!("metrics backup has shut down");
+    }
+    // Even if the remote storage is not configured, we still want to clear the metrics.
+    let storage = backup_config
+        .remote_storage_config
+        .as_ref()
+        .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
+        .transpose()?;
+    let mut ticker = tokio::time::interval(backup_config.interval);
+    let mut prev = Utc::now();
+    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
+    loop {
+        select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await;
+        let now = Utc::now();
+        collect_metrics_backup_iteration(
+            &USAGE_METRICS.backup_endpoints,
+            &storage,
+            &hostname,
+            prev,
+            now,
+            backup_config.chunk_size,
+        )
+        .await;
+
+        prev = now;
+        if cancellation_token.is_cancelled() {
+            info!("metrics backup has been cancelled");
+            break;
         }
     }
+    Ok(())
+}
+
+#[instrument(skip_all)]
+async fn collect_metrics_backup_iteration(
+    endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
+    storage: &Option<GenericRemoteStorage>,
+    hostname: &str,
+    prev: DateTime<Utc>,
+    now: DateTime<Utc>,
+    chunk_size: usize,
+) {
+    let year = now.year();
+    let month = now.month();
+    let day = now.day();
+    let hour = now.hour();
+    let minute = now.minute();
+    let second = now.second();
+    let cancel = CancellationToken::new();
+
+    info!("starting collect_metrics_backup_iteration");
+
+    let metrics_to_send = collect_and_clear_metrics(endpoints);
+
+    if metrics_to_send.is_empty() {
+        trace!("no new metrics to send");
+    }
+
+    // Send metrics.
+    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
+        let real_now = Utc::now();
+        let id = uuid::Uuid::new_v7(Timestamp::from_unix(
+            NoContext,
+            real_now.second().into(),
+            real_now.nanosecond(),
+        ));
+        let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
+        let remote_path = match RemotePath::from_string(&path) {
+            Ok(remote_path) => remote_path,
+            Err(e) => {
+                error!("failed to create remote path from str {path}: {:?}", e);
+                continue;
+            }
+        };
+
+        let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
+
+        if let Err(e) = res {
+            error!(
+                "failed to upload consumption events to remote storage: {:?}",
+                e
+            );
+        }
+    }
+}
+
+async fn upload_events_chunk(
+    storage: &Option<GenericRemoteStorage>,
+    chunk: EventChunk<'_, Event<Ids, &'static str>>,
+    remote_path: &RemotePath,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let storage = match storage {
+        Some(storage) => storage,
+        None => {
+            error!("no remote storage configured");
+            return Ok(());
+        }
+    };
+    let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
+    let mut encoder = GzipEncoder::new(Vec::new());
+    encoder.write_all(&data).await.context("compress metrics")?;
+    encoder.shutdown().await.context("compress metrics")?;
+    let compressed_data: Bytes = encoder.get_ref().clone().into();
+    backoff::retry(
+        || async {
+            let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
+            storage
+                .upload(stream, compressed_data.len(), remote_path, None, cancel)
+                .await
+        },
+        TimeoutOrCancel::caused_by_cancel,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_UPLOAD_MAX_RETRIES,
+        "request_data_upload",
+        cancel,
+    )
+    .await
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+    .and_then(|x| x)
+    .context("request_data_upload")?;
+    Ok(())
 }
 
 #[cfg(test)]
@@ -248,8 +494,8 @@ mod tests {
     };
     use url::Url;
 
-    use super::{collect_metrics_iteration, Ids, Metrics};
-    use crate::{http, rate_limiter::RateLimiterConfig};
+    use super::*;
+    use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId};
 
     #[tokio::test]
     async fn metrics() {
@@ -284,18 +530,19 @@ mod tests {
         let now = Utc::now();
 
         // no counters have been registered
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert!(r.is_empty());
 
         // register a new counter
+
         let counter = metrics.register(Ids {
-            endpoint_id: "e1".into(),
-            branch_id: "b1".into(),
+            endpoint_id: (&EndpointId::from("e1")).into(),
+            branch_id: (&BranchId::from("b1")).into(),
         });
 
         // the counter should be observed despite 0 egress
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert_eq!(r.len(), 1);
         assert_eq!(r[0].events.len(), 1);
@@ -305,7 +552,7 @@ mod tests {
         counter.record_egress(1);
 
         // egress should be observered
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert_eq!(r.len(), 1);
         assert_eq!(r[0].events.len(), 1);
@@ -315,11 +562,19 @@ mod tests {
         drop(counter);
 
         // we do not observe the counter
-        collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
         let r = std::mem::take(&mut *reports2.lock().unwrap());
         assert!(r.is_empty());
 
         // counter is unregistered
         assert!(metrics.endpoints.is_empty());
+
+        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
+            .await;
+        assert!(!metrics.backup_endpoints.is_empty());
+        collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
+            .await;
+        // backup counter is unregistered after the second iteration
+        assert!(metrics.backup_endpoints.is_empty());
     }
 }
diff --git a/pyproject.toml b/pyproject.toml
index e347d47cbf..156f135062 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,4 +94,5 @@ select = [
     "I", # isort
     "W", # pycodestyle
     "B", # bugbear
+    "UP032", # f-string
 ]
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index cb4a1def1f..c8b732fee1 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -33,6 +33,7 @@ once_cell.workspace = true
 parking_lot.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
+rand.workspace = true
 regex.workspace = true
 scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 3c4c81e499..e53ccaeb3d 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -28,7 +28,7 @@ use utils::pid_file;
 use metrics::set_build_info_metric;
 use safekeeper::defaults::{
     DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
-    DEFAULT_PG_LISTEN_ADDR,
+    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
@@ -170,6 +170,13 @@ struct Args {
     /// still needed for existing replication connection.
     #[arg(long)]
     walsenders_keep_horizon: bool,
+    /// Enable partial backup. If disabled, safekeeper will not upload partial
+    /// segments to remote storage.
+    #[arg(long)]
+    partial_backup_enabled: bool,
+    /// Controls how long backup will wait until uploading the partial segment.
+    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
+    partial_backup_timeout: Duration,
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -300,6 +307,8 @@ async fn main() -> anyhow::Result<()> {
         http_auth,
         current_thread_runtime: args.current_thread_runtime,
         walsenders_keep_horizon: args.walsenders_keep_horizon,
+        partial_backup_enabled: args.partial_backup_enabled,
+        partial_backup_timeout: args.partial_backup_timeout,
     };
 
     // initialize sentry if SENTRY_DSN is provided
@@ -365,6 +374,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
 
     let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
 
+    wal_backup::init_remote_storage(&conf);
+
     // Keep handles to main tasks to die if any of them disappears.
     let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
         FuturesUnordered::new();
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index d822c87c0e..fe9f2e6899 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
 use crate::SafeKeeperConf;
 
 pub const SK_MAGIC: u32 = 0xcafeceefu32;
-pub const SK_FORMAT_VERSION: u32 = 7;
+pub const SK_FORMAT_VERSION: u32 = 8;
 
 // contains persistent metadata for safekeeper
 const CONTROL_FILE_NAME: &str = "safekeeper.control";
diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs
index 2fd719326d..8f4dfe9b43 100644
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -2,6 +2,7 @@
 use crate::{
     safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
     state::{PersistedPeers, TimelinePersistentState},
+    wal_backup_partial,
 };
 use anyhow::{bail, Result};
 use pq_proto::SystemId;
@@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 {
     pub peers: PersistedPeers,
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct SafeKeeperStateV7 {
+    #[serde(with = "hex")]
+    pub tenant_id: TenantId,
+    #[serde(with = "hex")]
+    pub timeline_id: TimelineId,
+    /// persistent acceptor state
+    pub acceptor_state: AcceptorState,
+    /// information about server
+    pub server: ServerInfo,
+    /// Unique id of the last *elected* proposer we dealt with. Not needed
+    /// for correctness, exists for monitoring purposes.
+    #[serde(with = "hex")]
+    pub proposer_uuid: PgUuid,
+    /// Since which LSN this timeline generally starts. Safekeeper might have
+    /// joined later.
+    pub timeline_start_lsn: Lsn,
+    /// Since which LSN safekeeper has (had) WAL for this timeline.
+    /// All WAL segments next to one containing local_start_lsn are
+    /// filled with data from the beginning.
+    pub local_start_lsn: Lsn,
+    /// Part of WAL acknowledged by quorum *and available locally*. Always points
+    /// to record boundary.
+    pub commit_lsn: Lsn,
+    /// LSN that points to the end of the last backed up segment. Useful to
+    /// persist to avoid finding out offloading progress on boot.
+    pub backup_lsn: Lsn,
+    /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
+    /// of last record streamed to everyone). Persisting it helps skipping
+    /// recovery in walproposer, generally we compute it from peers. In
+    /// walproposer proto called 'truncate_lsn'. Updates are currently drived
+    /// only by walproposer.
+    pub peer_horizon_lsn: Lsn,
+    /// LSN of the oldest known checkpoint made by pageserver and successfully
+    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
+    /// informational purposes, we receive it from pageserver (or broker).
+    pub remote_consistent_lsn: Lsn,
+    // Peers and their state as we remember it. Knowing peers themselves is
+    // fundamental; but state is saved here only for informational purposes and
+    // obviously can be stale. (Currently not saved at all, but let's provision
+    // place to have less file version upgrades).
+    pub peers: PersistedPeers,
+}
+
 pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
     // migrate to storing full term history
     if version == 1 {
@@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
             peer_horizon_lsn: oldstate.truncate_lsn,
             remote_consistent_lsn: Lsn(0),
             peers: PersistedPeers(vec![]),
+            partial_backup: wal_backup_partial::State::default(),
         });
     // migrate to hexing some ids
     } else if version == 2 {
@@ -190,6 +236,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
             peer_horizon_lsn: oldstate.truncate_lsn,
             remote_consistent_lsn: Lsn(0),
             peers: PersistedPeers(vec![]),
+            partial_backup: wal_backup_partial::State::default(),
         });
     // migrate to moving tenant_id/timeline_id to the top and adding some lsns
     } else if version == 3 {
@@ -213,6 +260,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
             peer_horizon_lsn: oldstate.truncate_lsn,
             remote_consistent_lsn: Lsn(0),
             peers: PersistedPeers(vec![]),
+            partial_backup: wal_backup_partial::State::default(),
         });
     // migrate to having timeline_start_lsn
     } else if version == 4 {
@@ -236,6 +284,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
             peer_horizon_lsn: oldstate.peer_horizon_lsn,
             remote_consistent_lsn: Lsn(0),
             peers: PersistedPeers(vec![]),
+            partial_backup: wal_backup_partial::State::default(),
         });
     } else if version == 5 {
         info!("reading safekeeper control file version {}", version);
@@ -262,7 +311,30 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
         oldstate.server.pg_version = 140005;
 
         return Ok(oldstate);
+    } else if version == 7 {
+        info!("reading safekeeper control file version {}", version);
+        let oldstate = SafeKeeperStateV7::des(&buf[..buf.len()])?;
+
+        return Ok(TimelinePersistentState {
+            tenant_id: oldstate.tenant_id,
+            timeline_id: oldstate.timeline_id,
+            acceptor_state: oldstate.acceptor_state,
+            server: oldstate.server,
+            proposer_uuid: oldstate.proposer_uuid,
+            timeline_start_lsn: oldstate.timeline_start_lsn,
+            local_start_lsn: oldstate.local_start_lsn,
+            commit_lsn: oldstate.commit_lsn,
+            backup_lsn: oldstate.backup_lsn,
+            peer_horizon_lsn: oldstate.peer_horizon_lsn,
+            remote_consistent_lsn: oldstate.remote_consistent_lsn,
+            peers: oldstate.peers,
+            partial_backup: wal_backup_partial::State::default(),
+        });
     }
+
+    // TODO: persist the file back to the disk after upgrade
+    // TODO: think about backward compatibility and rollbacks
+
     bail!("unsupported safekeeper control file version {}", version)
 }
 
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index ce4b4d7bd0..9b4d4dbb38 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -32,6 +32,7 @@ pub mod send_wal;
 pub mod state;
 pub mod timeline;
 pub mod wal_backup;
+pub mod wal_backup_partial;
 pub mod wal_service;
 pub mod wal_storage;
 
@@ -48,6 +49,7 @@ pub mod defaults {
 
     pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
     pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
+    pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
 }
 
 #[derive(Debug, Clone)]
@@ -79,6 +81,8 @@ pub struct SafeKeeperConf {
     pub http_auth: Option<Arc<SwappableJwtAuth>>,
     pub current_thread_runtime: bool,
     pub walsenders_keep_horizon: bool,
+    pub partial_backup_enabled: bool,
+    pub partial_backup_timeout: Duration,
 }
 
 impl SafeKeeperConf {
@@ -123,6 +127,8 @@ impl SafeKeeperConf {
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
             current_thread_runtime: false,
             walsenders_keep_horizon: false,
+            partial_backup_enabled: false,
+            partial_backup_timeout: Duration::from_secs(0),
         }
     }
 }
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index e541527b6a..28ae042bb3 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
 });
+pub static PARTIAL_BACKUP_UPLOADS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_partial_backup_uploads_total",
+        "Number of partial backup uploads to the S3",
+        &["result"]
+    )
+    .expect("Failed to register safekeeper_partial_backup_uploads_total counter")
+});
+pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_partial_backup_uploaded_bytes_total",
+        "Number of bytes uploaded to the S3 during partial backup"
+    )
+    .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
+});
 
 pub const LABEL_UNKNOWN: &str = "unknown";
 
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index d7c8fa6955..f2ee0403eb 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -1221,6 +1221,7 @@ mod tests {
                     commit_lsn: Lsn(1234567600),
                 },
             )]),
+            partial_backup: crate::wal_backup_partial::State::default(),
         };
 
         let ser = state.ser().unwrap();
@@ -1266,6 +1267,8 @@ mod tests {
             0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
             0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
             0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
+            // partial_backup
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         ];
 
         assert_eq!(Hex(&ser), Hex(&expected));
diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs
index 82f7954051..be5e516296 100644
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -13,6 +13,7 @@ use utils::{
 use crate::{
     control_file,
     safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
+    wal_backup_partial::{self},
 };
 
 /// Persistent information stored on safekeeper node about timeline.
@@ -54,11 +55,14 @@ pub struct TimelinePersistentState {
     /// pushed to s3. We don't remove WAL beyond it. Persisted only for
     /// informational purposes, we receive it from pageserver (or broker).
     pub remote_consistent_lsn: Lsn,
-    // Peers and their state as we remember it. Knowing peers themselves is
-    // fundamental; but state is saved here only for informational purposes and
-    // obviously can be stale. (Currently not saved at all, but let's provision
-    // place to have less file version upgrades).
+    /// Peers and their state as we remember it. Knowing peers themselves is
+    /// fundamental; but state is saved here only for informational purposes and
+    /// obviously can be stale. (Currently not saved at all, but let's provision
+    /// place to have less file version upgrades).
     pub peers: PersistedPeers,
+    /// Holds names of partial segments uploaded to remote storage. Used to
+    /// clean up old objects without leaving garbage in remote storage.
+    pub partial_backup: wal_backup_partial::State,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -93,6 +97,7 @@ impl TimelinePersistentState {
                     .map(|p| (*p, PersistedPeerInfo::new()))
                     .collect(),
             ),
+            partial_backup: wal_backup_partial::State::default(),
         }
     }
 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 4901b86acf..64f764f191 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::FullTimelineInfo;
 use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, wal_storage};
+use crate::{debug_dump, wal_backup_partial, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};
 
 /// Things safekeeper should know about timeline state on peers.
@@ -503,6 +503,9 @@ impl Timeline {
         if conf.peer_recovery_enabled {
             tokio::spawn(recovery_main(self.clone(), conf.clone()));
         }
+        if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
+            tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
+        }
     }
 
     /// Delete timeline from disk completely, by removing timeline directory.
@@ -667,8 +670,8 @@ impl Timeline {
             term_flush_lsn =
                 TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
         }
-        self.commit_lsn_watch_tx.send(commit_lsn)?;
         self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
+        self.commit_lsn_watch_tx.send(commit_lsn)?;
         Ok(rmsg)
     }
 
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 944d80f777..e3f6a606a0 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -18,7 +18,7 @@ use std::time::Duration;
 use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::{XLogSegNo, PG_TLI};
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
 use tokio::fs::File;
 
 use tokio::select;
@@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
         .unwrap()
 }
 
+pub fn init_remote_storage(conf: &SafeKeeperConf) {
+    // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
+    // dependencies to all tasks instead.
+    REMOTE_STORAGE.get_or_init(|| {
+        conf.remote_storage
+            .as_ref()
+            .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
+    });
+}
+
 const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
 
 /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
@@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main(
         conf.remote_storage
     );
 
-    let conf_ = conf.clone();
-    REMOTE_STORAGE.get_or_init(|| {
-        conf_
-            .remote_storage
-            .as_ref()
-            .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
-    });
-
     // Presence in this map means launcher is aware s3 offloading is needed for
     // the timeline, but task is started only if it makes sense for to offload
     // from this safekeeper.
@@ -518,6 +520,35 @@ async fn backup_object(
         .await
 }
 
+pub(crate) async fn backup_partial_segment(
+    source_file: &Utf8Path,
+    target_file: &RemotePath,
+    size: usize,
+) -> Result<()> {
+    let storage = get_configured_remote_storage();
+
+    let file = File::open(&source_file)
+        .await
+        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
+
+    // limiting the file to read only the first `size` bytes
+    let limited_file = tokio::io::AsyncReadExt::take(file, size as u64);
+
+    let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE);
+
+    let cancel = CancellationToken::new();
+
+    storage
+        .upload(
+            file,
+            size,
+            target_file,
+            Some(StorageMetadata::from([("sk_type", "partial_segment")])),
+            &cancel,
+        )
+        .await
+}
+
 pub async fn read_object(
     file_path: &RemotePath,
     offset: u64,
@@ -604,6 +635,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     Ok(())
 }
 
+/// Used by wal_backup_partial.
+pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
+    let cancel = CancellationToken::new(); // not really used
+    let storage = get_configured_remote_storage();
+    storage.delete_objects(paths, &cancel).await
+}
+
 /// Copy segments from one timeline to another. Used in copy_timeline.
 pub async fn copy_s3_segments(
     wal_seg_size: usize,
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
new file mode 100644
index 0000000000..200096ac5c
--- /dev/null
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -0,0 +1,407 @@
+//! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
+//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
+//! was changed), the segment will be uploaded to S3 in about 15 minutes.
+//!
+//! The filename format for partial segments is
+//! `Segment_Term_Flush_Commit_skNN.partial`, where:
+//! - `Segment` – the segment name, like `000000010000000000000001`
+//! - `Term` – current term
+//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568`
+//! - `Commit` – commit_lsn in the same hex format
+//! - `NN` – safekeeper_id, like `1`
+//!
+//! The full object name example:
+//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial`
+//!
+//! Each safekeeper will keep info about remote partial segments in its control
+//! file. Code updates state in the control file before doing any S3 operations.
+//! This way control file stores information about all potentially existing
+//! remote partial segments and can clean them up after uploading a newer version.
+
+use std::sync::Arc;
+
+use camino::Utf8PathBuf;
+use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
+use rand::Rng;
+use remote_storage::RemotePath;
+use serde::{Deserialize, Serialize};
+
+use tracing::{debug, error, info, instrument};
+use utils::lsn::Lsn;
+
+use crate::{
+    metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
+    safekeeper::Term,
+    timeline::Timeline,
+    wal_backup, SafeKeeperConf,
+};
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum UploadStatus {
+    /// Upload is in progress
+    InProgress,
+    /// Upload is finished
+    Uploaded,
+    /// Deletion is in progress
+    Deleting,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct PartialRemoteSegment {
+    pub status: UploadStatus,
+    pub name: String,
+    pub commit_lsn: Lsn,
+    pub flush_lsn: Lsn,
+    pub term: Term,
+}
+
+impl PartialRemoteSegment {
+    fn eq_without_status(&self, other: &Self) -> bool {
+        self.name == other.name
+            && self.commit_lsn == other.commit_lsn
+            && self.flush_lsn == other.flush_lsn
+            && self.term == other.term
+    }
+}
+
+// NB: these structures are a part of a control_file, you can't change them without
+// changing the control file format version.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
+pub struct State {
+    pub segments: Vec<PartialRemoteSegment>,
+}
+
+impl State {
+    /// Find an Uploaded segment. There should be only one Uploaded segment at a time.
+    fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
+        self.segments
+            .iter()
+            .find(|seg| seg.status == UploadStatus::Uploaded)
+            .cloned()
+    }
+}
+
+struct PartialBackup {
+    wal_seg_size: usize,
+    tli: Arc<Timeline>,
+    conf: SafeKeeperConf,
+    local_prefix: Utf8PathBuf,
+    remote_prefix: Utf8PathBuf,
+
+    state: State,
+}
+
+// Read-only methods for getting segment names
+impl PartialBackup {
+    fn segno(&self, lsn: Lsn) -> XLogSegNo {
+        lsn.segment_number(self.wal_seg_size)
+    }
+
+    fn segment_name(&self, segno: u64) -> String {
+        XLogFileName(PG_TLI, segno, self.wal_seg_size)
+    }
+
+    fn remote_segment_name(
+        &self,
+        segno: u64,
+        term: u64,
+        commit_lsn: Lsn,
+        flush_lsn: Lsn,
+    ) -> String {
+        format!(
+            "{}_{}_{:016X}_{:016X}_sk{}.partial",
+            self.segment_name(segno),
+            term,
+            flush_lsn.0,
+            commit_lsn.0,
+            self.conf.my_id.0,
+        )
+    }
+
+    fn local_segment_name(&self, segno: u64) -> String {
+        format!("{}.partial", self.segment_name(segno))
+    }
+}
+
+impl PartialBackup {
+    /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded.
+    async fn prepare_upload(&self) -> PartialRemoteSegment {
+        // this operation takes a lock to get the actual state
+        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
+        let flush_lsn = Lsn(sk_info.flush_lsn);
+        let commit_lsn = Lsn(sk_info.commit_lsn);
+        let term = sk_info.term;
+        let segno = self.segno(flush_lsn);
+
+        let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn);
+
+        PartialRemoteSegment {
+            status: UploadStatus::InProgress,
+            name,
+            commit_lsn,
+            flush_lsn,
+            term,
+        }
+    }
+
+    /// Reads segment from disk and uploads it to the remote storage.
+    async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> {
+        let flush_lsn = prepared.flush_lsn;
+        let segno = self.segno(flush_lsn);
+
+        // We're going to backup bytes from the start of the segment up to flush_lsn.
+        let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
+
+        let local_path = self.local_prefix.join(self.local_segment_name(segno));
+        let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
+
+        // Upload first `backup_bytes` bytes of the segment to the remote storage.
+        wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
+        PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
+
+        // We uploaded the segment, now let's verify that the data is still actual.
+        // If the term changed, we cannot guarantee the validity of the uploaded data.
+        // If the term is the same, we know the data is not corrupted.
+        let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
+        if sk_info.term != prepared.term {
+            anyhow::bail!("term changed during upload");
+        }
+        assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn));
+        assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn));
+
+        Ok(())
+    }
+
+    /// Write new state to disk. If in-memory and on-disk states diverged, returns an error.
+    async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> {
+        self.tli
+            .map_control_file(|cf| {
+                if cf.partial_backup != self.state {
+                    let memory = self.state.clone();
+                    self.state = cf.partial_backup.clone();
+                    anyhow::bail!(
+                        "partial backup state diverged, memory={:?}, disk={:?}",
+                        memory,
+                        cf.partial_backup
+                    );
+                }
+
+                cf.partial_backup = new_state.clone();
+                Ok(())
+            })
+            .await?;
+        // update in-memory state
+        self.state = new_state;
+        Ok(())
+    }
+
+    /// Upload the latest version of the partial segment and garbage collect older versions.
+    #[instrument(name = "upload", skip_all, fields(name = %prepared.name))]
+    async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> {
+        info!("starting upload {:?}", prepared);
+
+        let state_0 = self.state.clone();
+        let state_1 = {
+            let mut state = state_0.clone();
+            state.segments.push(prepared.clone());
+            state
+        };
+
+        // we're going to upload a new segment, let's write it to disk to make GC later
+        self.commit_state(state_1).await?;
+
+        self.upload_segment(prepared.clone()).await?;
+
+        let state_2 = {
+            let mut state = state_0.clone();
+            for seg in state.segments.iter_mut() {
+                seg.status = UploadStatus::Deleting;
+            }
+            let mut actual_remote_segment = prepared.clone();
+            actual_remote_segment.status = UploadStatus::Uploaded;
+            state.segments.push(actual_remote_segment);
+            state
+        };
+
+        // we've uploaded new segment, it's actual, all other segments should be GCed
+        self.commit_state(state_2).await?;
+        self.gc().await?;
+
+        Ok(())
+    }
+
+    /// Delete all non-Uploaded segments from the remote storage. There should be only one
+    /// Uploaded segment at a time.
+    #[instrument(name = "gc", skip_all)]
+    async fn gc(&mut self) -> anyhow::Result<()> {
+        let mut segments_to_delete = vec![];
+
+        let new_segments: Vec<PartialRemoteSegment> = self
+            .state
+            .segments
+            .iter()
+            .filter_map(|seg| {
+                if seg.status == UploadStatus::Uploaded {
+                    Some(seg.clone())
+                } else {
+                    segments_to_delete.push(seg.name.clone());
+                    None
+                }
+            })
+            .collect();
+
+        info!("deleting objects: {:?}", segments_to_delete);
+        let mut objects_to_delete = vec![];
+        for seg in segments_to_delete.iter() {
+            let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
+            objects_to_delete.push(remote_path);
+        }
+
+        // removing segments from remote storage
+        wal_backup::delete_objects(&objects_to_delete).await?;
+
+        // now we can update the state on disk
+        let new_state = {
+            let mut state = self.state.clone();
+            state.segments = new_segments;
+            state
+        };
+        self.commit_state(new_state).await?;
+
+        Ok(())
+    }
+}
+
+#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
+pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+    debug!("started");
+    let await_duration = conf.partial_backup_timeout;
+
+    let mut cancellation_rx = match tli.get_cancellation_rx() {
+        Ok(rx) => rx,
+        Err(_) => {
+            info!("timeline canceled during task start");
+            return;
+        }
+    };
+
+    // sleep for random time to avoid thundering herd
+    {
+        let randf64 = rand::thread_rng().gen_range(0.0..1.0);
+        let sleep_duration = await_duration.mul_f64(randf64);
+        tokio::time::sleep(sleep_duration).await;
+    }
+
+    let (_, persistent_state) = tli.get_state().await;
+    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
+    let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
+    let wal_seg_size = tli.get_wal_seg_size().await;
+
+    let local_prefix = tli.timeline_dir.clone();
+    let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
+        Ok(path) => path.to_owned(),
+        Err(e) => {
+            error!("failed to strip workspace dir prefix: {:?}", e);
+            return;
+        }
+    };
+
+    let mut backup = PartialBackup {
+        wal_seg_size,
+        tli,
+        state: persistent_state.partial_backup,
+        conf,
+        local_prefix,
+        remote_prefix,
+    };
+
+    debug!("state: {:?}", backup.state);
+
+    'outer: loop {
+        // wait until we have something to upload
+        let uploaded_segment = backup.state.uploaded_segment();
+        if let Some(seg) = &uploaded_segment {
+            // if we already uploaded something, wait until we have something new
+            while flush_lsn_rx.borrow().lsn == seg.flush_lsn
+                && *commit_lsn_rx.borrow() == seg.commit_lsn
+                && flush_lsn_rx.borrow().term == seg.term
+            {
+                tokio::select! {
+                    _ = cancellation_rx.changed() => {
+                        info!("timeline canceled");
+                        return;
+                    }
+                    _ = commit_lsn_rx.changed() => {}
+                    _ = flush_lsn_rx.changed() => {}
+                }
+            }
+        }
+
+        // if we don't have any data and zero LSNs, wait for something
+        while flush_lsn_rx.borrow().lsn == Lsn(0) {
+            tokio::select! {
+                _ = cancellation_rx.changed() => {
+                    info!("timeline canceled");
+                    return;
+                }
+                _ = flush_lsn_rx.changed() => {}
+            }
+        }
+
+        // fixing the segno and waiting some time to prevent reuploading the same segment too often
+        let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
+        let timeout = tokio::time::sleep(await_duration);
+        tokio::pin!(timeout);
+        let mut timeout_expired = false;
+
+        // waiting until timeout expires OR segno changes
+        'inner: loop {
+            tokio::select! {
+                _ = cancellation_rx.changed() => {
+                    info!("timeline canceled");
+                    return;
+                }
+                _ = commit_lsn_rx.changed() => {}
+                _ = flush_lsn_rx.changed() => {
+                    let segno = backup.segno(flush_lsn_rx.borrow().lsn);
+                    if segno != pending_segno {
+                        // previous segment is no longer partial, aborting the wait
+                        break 'inner;
+                    }
+                }
+                _ = &mut timeout => {
+                    // timeout expired, now we are ready for upload
+                    timeout_expired = true;
+                    break 'inner;
+                }
+            }
+        }
+
+        if !timeout_expired {
+            // likely segno has changed, let's try again in the next iteration
+            continue 'outer;
+        }
+
+        let prepared = backup.prepare_upload().await;
+        if let Some(seg) = &uploaded_segment {
+            if seg.eq_without_status(&prepared) {
+                // we already uploaded this segment, nothing to do
+                continue 'outer;
+            }
+        }
+
+        match backup.do_upload(&prepared).await {
+            Ok(()) => {
+                debug!(
+                    "uploaded {} up to flush_lsn {}",
+                    prepared.name, prepared.flush_lsn
+                );
+                PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc();
+            }
+            Err(e) => {
+                info!("failed to upload {}: {:#}", prepared.name, e);
+                PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc();
+            }
+        }
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index e3aaf5d391..bc21c4d765 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -176,6 +176,8 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         http_auth: None,
         current_thread_runtime: false,
         walsenders_keep_horizon: false,
+        partial_backup_enabled: false,
+        partial_backup_timeout: Duration::from_secs(0),
     };
 
     let mut global = GlobalMap::new(disk, conf.clone())?;
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 980f343047..84b69cb36a 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
     Returns basepath for files with captured output.
     """
     assert isinstance(cmd, list)
-    base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
+    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
     basepath = os.path.join(capture_dir, base)
     stdout_filename = basepath + ".stdout"
     stderr_filename = basepath + ".stderr"
 
     with open(stdout_filename, "w") as stdout_f:
         with open(stderr_filename, "w") as stderr_f:
-            print('(capturing output to "{}.stdout")'.format(base))
+            print(f'(capturing output to "{base}.stdout")')
             subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
 
     return basepath
@@ -82,11 +82,9 @@ class PgBin:
 
     def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
         self.log_dir = log_dir
-        self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin")
+        self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
         self.env = os.environ.copy()
-        self.env["LD_LIBRARY_PATH"] = os.path.join(
-            str(pg_distrib_dir), "v{}".format(pg_version), "lib"
-        )
+        self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
 
     def _fixpath(self, command: List[str]):
         if "/" not in command[0]:
@@ -110,7 +108,7 @@ class PgBin:
         """
 
         self._fixpath(command)
-        print('Running command "{}"'.format(" ".join(command)))
+        print(f'Running command "{" ".join(command)}"')
         env = self._build_env(env)
         subprocess.run(command, env=env, cwd=cwd, check=True)
 
@@ -128,7 +126,7 @@ class PgBin:
         """
 
         self._fixpath(command)
-        print('Running command "{}"'.format(" ".join(command)))
+        print(f'Running command "{" ".join(command)}"')
         env = self._build_env(env)
         return subprocess_capture(
             str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
@@ -300,7 +298,7 @@ class NeonPageserverHttpClient(requests.Session):
 
 def lsn_to_hex(num: int) -> str:
     """Convert lsn from int to standard hex notation."""
-    return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF)
+    return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
 
 
 def lsn_from_hex(lsn_hex: str) -> int:
@@ -331,16 +329,12 @@ def wait_for_upload(
         if current_lsn >= lsn:
             return
         print(
-            "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
-                lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1
-            )
+            f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
         )
         time.sleep(1)
 
     raise Exception(
-        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
-            lsn_to_hex(lsn), lsn_to_hex(current_lsn)
-        )
+        f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
     )
 
 
diff --git a/control_plane/attachment_service/Cargo.toml b/storage_controller/Cargo.toml
similarity index 81%
rename from control_plane/attachment_service/Cargo.toml
rename to storage_controller/Cargo.toml
index 0201e0ed86..165cafaf4e 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "attachment_service"
+name = "storage_controller"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
@@ -25,6 +25,7 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
+itertools.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
@@ -44,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
 
-utils = { path = "../../libs/utils/" }
-metrics = { path = "../../libs/metrics/" }
-control_plane = { path = ".." }
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+utils = { path = "../libs/utils/" }
+metrics = { path = "../libs/metrics/" }
+control_plane = { path = "../control_plane" }
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
 
diff --git a/control_plane/attachment_service/migrations/.keep b/storage_controller/migrations/.keep
similarity index 100%
rename from control_plane/attachment_service/migrations/.keep
rename to storage_controller/migrations/.keep
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
rename to storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
rename to storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
rename to storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
rename to storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql
diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
rename to storage_controller/migrations/2024-02-29-094122_generations_null/down.sql
diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
rename to storage_controller/migrations/2024-02-29-094122_generations_null/up.sql
diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
rename to storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql
diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
rename to storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql
diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
rename to storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql
diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql
similarity index 100%
rename from control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
rename to storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql
diff --git a/control_plane/attachment_service/src/auth.rs b/storage_controller/src/auth.rs
similarity index 100%
rename from control_plane/attachment_service/src/auth.rs
rename to storage_controller/src/auth.rs
diff --git a/control_plane/attachment_service/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
similarity index 61%
rename from control_plane/attachment_service/src/compute_hook.rs
rename to storage_controller/src/compute_hook.rs
index 1a8dc6b86d..eb0c4472e4 100644
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -1,3 +1,4 @@
+use std::sync::Arc;
 use std::{collections::HashMap, time::Duration};
 
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -18,14 +19,26 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
 
 pub(crate) const API_CONCURRENCY: usize = 32;
 
+struct UnshardedComputeHookTenant {
+    // Which node is this tenant attached to
+    node_id: NodeId,
+
+    // Must hold this lock to send a notification.
+    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
+}
 struct ShardedComputeHookTenant {
     stripe_size: ShardStripeSize,
     shard_count: ShardCount,
     shards: Vec<(ShardNumber, NodeId)>,
+
+    // Must hold this lock to send a notification.  The contents represent
+    // the last successfully sent notification, and are used to coalesce multiple
+    // updates by only sending when there is a chance since our last successful send.
+    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
 }
 
 enum ComputeHookTenant {
-    Unsharded(NodeId),
+    Unsharded(UnshardedComputeHookTenant),
     Sharded(ShardedComputeHookTenant),
 }
 
@@ -37,9 +50,20 @@ impl ComputeHookTenant {
                 shards: vec![(tenant_shard_id.shard_number, node_id)],
                 stripe_size,
                 shard_count: tenant_shard_id.shard_count,
+                send_lock: Arc::default(),
             })
         } else {
-            Self::Unsharded(node_id)
+            Self::Unsharded(UnshardedComputeHookTenant {
+                node_id,
+                send_lock: Arc::default(),
+            })
+        }
+    }
+
+    fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
+        match self {
+            Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
+            Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
         }
     }
 
@@ -52,8 +76,8 @@ impl ComputeHookTenant {
         node_id: NodeId,
     ) {
         match self {
-            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
-                *existing_node_id = node_id
+            Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
+                unsharded_tenant.node_id = node_id
             }
             Self::Sharded(sharded_tenant)
                 if sharded_tenant.stripe_size == stripe_size
@@ -80,14 +104,14 @@ impl ComputeHookTenant {
     }
 }
 
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
 struct ComputeHookNotifyRequestShard {
     node_id: NodeId,
     shard_number: ShardNumber,
 }
 
 /// Request body that we send to the control plane to notify it of where a tenant is attached
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
 struct ComputeHookNotifyRequest {
     tenant_id: TenantId,
     stripe_size: Option<ShardStripeSize>,
@@ -120,14 +144,44 @@ pub(crate) enum NotifyError {
     Fatal(StatusCode),
 }
 
+enum MaybeSendResult {
+    // Please send this request while holding the lock, and if you succeed then write
+    // the request into the lock.
+    Transmit(
+        (
+            ComputeHookNotifyRequest,
+            tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
+        ),
+    ),
+    // Something requires sending, but you must wait for a current sender then call again
+    AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
+    // Nothing requires sending
+    Noop,
+}
+
 impl ComputeHookTenant {
-    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
-        match self {
-            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
+    fn maybe_send(
+        &self,
+        tenant_id: TenantId,
+        lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
+    ) -> MaybeSendResult {
+        let locked = match lock {
+            Some(already_locked) => already_locked,
+            None => {
+                // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
+                let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
+                    return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
+                };
+                locked
+            }
+        };
+
+        let request = match self {
+            Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
                 tenant_id,
                 shards: vec![ComputeHookNotifyRequestShard {
                     shard_number: ShardNumber(0),
-                    node_id: *node_id,
+                    node_id: unsharded_tenant.node_id,
                 }],
                 stripe_size: None,
             }),
@@ -151,12 +205,25 @@ impl ComputeHookTenant {
                 // Sharded tenant doesn't yet have information for all its shards
 
                 tracing::info!(
-                    "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
+                    "ComputeHookTenant::maybe_send: not enough shards ({}/{})",
                     sharded_tenant.shards.len(),
                     sharded_tenant.shard_count.count()
                 );
                 None
             }
+        };
+
+        match request {
+            None => {
+                // Not yet ready to emit a notification
+                tracing::info!("Tenant isn't yet ready to emit a notification");
+                MaybeSendResult::Noop
+            }
+            Some(request) if Some(&request) == locked.as_ref() => {
+                // No change from the last value successfully sent
+                MaybeSendResult::Noop
+            }
+            Some(request) => MaybeSendResult::Transmit((request, locked)),
         }
     }
 }
@@ -166,8 +233,15 @@ impl ComputeHookTenant {
 /// the compute connection string.
 pub(super) struct ComputeHook {
     config: Config,
-    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
     authorization_header: Option<String>,
+
+    // Concurrency limiter, so that we do not overload the cloud control plane when updating
+    // large numbers of tenants (e.g. when failing over after a node failure)
+    api_concurrency: tokio::sync::Semaphore,
+
+    // This lock is only used in testing enviroments, to serialize calls into neon_lock
+    neon_local_lock: tokio::sync::Mutex<()>,
 }
 
 impl ComputeHook {
@@ -181,14 +255,20 @@ impl ComputeHook {
             state: Default::default(),
             config,
             authorization_header,
+            neon_local_lock: Default::default(),
+            api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
         }
     }
 
     /// For test environments: use neon_local's LocalEnv to update compute
     async fn do_notify_local(
         &self,
-        reconfigure_request: ComputeHookNotifyRequest,
+        reconfigure_request: &ComputeHookNotifyRequest,
     ) -> anyhow::Result<()> {
+        // neon_local updates are not safe to call concurrently, use a lock to serialize
+        // all calls to this function
+        let _locked = self.neon_local_lock.lock().await;
+
         let env = match LocalEnv::load_config() {
             Ok(e) => e,
             Err(e) => {
@@ -205,7 +285,7 @@ impl ComputeHook {
         } = reconfigure_request;
 
         let compute_pageservers = shards
-            .into_iter()
+            .iter()
             .map(|shard| {
                 let ps_conf = env
                     .get_pageserver_conf(shard.node_id)
@@ -217,10 +297,10 @@ impl ComputeHook {
             .collect::<Vec<_>>();
 
         for (endpoint_name, endpoint) in &cplane.endpoints {
-            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
+            if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
                 tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                 endpoint
-                    .reconfigure(compute_pageservers.clone(), stripe_size)
+                    .reconfigure(compute_pageservers.clone(), *stripe_size)
                     .await?;
             }
         }
@@ -298,12 +378,23 @@ impl ComputeHook {
     async fn do_notify(
         &self,
         url: &String,
-        reconfigure_request: ComputeHookNotifyRequest,
+        reconfigure_request: &ComputeHookNotifyRequest,
         cancel: &CancellationToken,
     ) -> Result<(), NotifyError> {
         let client = reqwest::Client::new();
+
+        // We hold these semaphore units across all retries, rather than only across each
+        // HTTP request: this is to preserve fairness and avoid a situation where a retry might
+        // time out waiting for a semaphore.
+        let _units = self
+            .api_concurrency
+            .acquire()
+            .await
+            // Interpret closed semaphore as shutdown
+            .map_err(|_| NotifyError::ShuttingDown)?;
+
         backoff::retry(
-            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
+            || self.do_notify_iteration(&client, url, reconfigure_request, cancel),
             |e| {
                 matches!(
                     e,
@@ -343,42 +434,70 @@ impl ComputeHook {
         stripe_size: ShardStripeSize,
         cancel: &CancellationToken,
     ) -> Result<(), NotifyError> {
-        let mut locked = self.state.lock().await;
+        let maybe_send_result = {
+            let mut state_locked = self.state.lock().unwrap();
 
-        use std::collections::hash_map::Entry;
-        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
-            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
-                tenant_shard_id,
-                stripe_size,
-                node_id,
-            )),
-            Entry::Occupied(e) => {
-                let tenant = e.into_mut();
-                tenant.update(tenant_shard_id, stripe_size, node_id);
-                tenant
+            use std::collections::hash_map::Entry;
+            let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
+                Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                    tenant_shard_id,
+                    stripe_size,
+                    node_id,
+                )),
+                Entry::Occupied(e) => {
+                    let tenant = e.into_mut();
+                    tenant.update(tenant_shard_id, stripe_size, node_id);
+                    tenant
+                }
+            };
+            tenant.maybe_send(tenant_shard_id.tenant_id, None)
+        };
+
+        // Process result: we may get an update to send, or we may have to wait for a lock
+        // before trying again.
+        let (request, mut send_lock_guard) = match maybe_send_result {
+            MaybeSendResult::Noop => {
+                return Ok(());
             }
+            MaybeSendResult::AwaitLock(send_lock) => {
+                let send_locked = send_lock.lock_owned().await;
+
+                // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
+                // we have acquired the send lock and take `[Self::state]` lock.  This is safe because maybe_send only uses
+                // try_lock.
+                let state_locked = self.state.lock().unwrap();
+                let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
+                    return Ok(());
+                };
+                match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
+                    MaybeSendResult::AwaitLock(_) => {
+                        unreachable!("We supplied lock guard")
+                    }
+                    MaybeSendResult::Noop => {
+                        return Ok(());
+                    }
+                    MaybeSendResult::Transmit((request, lock)) => (request, lock),
+                }
+            }
+            MaybeSendResult::Transmit((request, lock)) => (request, lock),
         };
 
-        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
-        let Some(reconfigure_request) = reconfigure_request else {
-            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
-            // until it does.
-            tracing::info!("Tenant isn't yet ready to emit a notification");
-            return Ok(());
-        };
-
-        if let Some(notify_url) = &self.config.compute_hook_url {
-            self.do_notify(notify_url, reconfigure_request, cancel)
-                .await
+        let result = if let Some(notify_url) = &self.config.compute_hook_url {
+            self.do_notify(notify_url, &request, cancel).await
         } else {
-            self.do_notify_local(reconfigure_request)
-                .await
-                .map_err(|e| {
-                    // This path is for testing only, so munge the error into our prod-style error type.
-                    tracing::error!("Local notification hook failed: {e}");
-                    NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
-                })
+            self.do_notify_local(&request).await.map_err(|e| {
+                // This path is for testing only, so munge the error into our prod-style error type.
+                tracing::error!("Local notification hook failed: {e}");
+                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
+            })
+        };
+
+        if result.is_ok() {
+            // Before dropping the send lock, stash the request we just sent so that
+            // subsequent callers can avoid redundantly re-sending the same thing.
+            *send_lock_guard = Some(request);
         }
+        result
     }
 }
 
@@ -402,21 +521,22 @@ pub(crate) mod tests {
             NodeId(1),
         );
 
-        // An unsharded tenant is always ready to emit a notification
-        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
-        assert_eq!(
-            tenant_state
-                .maybe_reconfigure(tenant_id)
-                .unwrap()
-                .shards
-                .len(),
-            1
-        );
-        assert!(tenant_state
-            .maybe_reconfigure(tenant_id)
-            .unwrap()
-            .stripe_size
-            .is_none());
+        // An unsharded tenant is always ready to emit a notification, but won't
+        // send the same one twice
+        let send_result = tenant_state.maybe_send(tenant_id, None);
+        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
+            anyhow::bail!("Wrong send result");
+        };
+        assert_eq!(request.shards.len(), 1);
+        assert!(request.stripe_size.is_none());
+
+        // Simulate successful send
+        *guard = Some(request);
+        drop(guard);
+
+        // Try asking again: this should be a no-op
+        let send_result = tenant_state.maybe_send(tenant_id, None);
+        assert!(matches!(send_result, MaybeSendResult::Noop));
 
         // Writing the first shard of a multi-sharded situation (i.e. in a split)
         // resets the tenant state and puts it in an non-notifying state (need to
@@ -430,7 +550,10 @@ pub(crate) mod tests {
             ShardStripeSize(32768),
             NodeId(1),
         );
-        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
+        assert!(matches!(
+            tenant_state.maybe_send(tenant_id, None),
+            MaybeSendResult::Noop
+        ));
 
         // Writing the second shard makes it ready to notify
         tenant_state.update(
@@ -443,22 +566,16 @@ pub(crate) mod tests {
             NodeId(1),
         );
 
-        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
-        assert_eq!(
-            tenant_state
-                .maybe_reconfigure(tenant_id)
-                .unwrap()
-                .shards
-                .len(),
-            2
-        );
-        assert_eq!(
-            tenant_state
-                .maybe_reconfigure(tenant_id)
-                .unwrap()
-                .stripe_size,
-            Some(ShardStripeSize(32768))
-        );
+        let send_result = tenant_state.maybe_send(tenant_id, None);
+        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
+            anyhow::bail!("Wrong send result");
+        };
+        assert_eq!(request.shards.len(), 2);
+        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
+
+        // Simulate successful send
+        *guard = Some(request);
+        drop(guard);
 
         Ok(())
     }
diff --git a/control_plane/attachment_service/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
similarity index 100%
rename from control_plane/attachment_service/src/heartbeater.rs
rename to storage_controller/src/heartbeater.rs
diff --git a/control_plane/attachment_service/src/http.rs b/storage_controller/src/http.rs
similarity index 96%
rename from control_plane/attachment_service/src/http.rs
rename to storage_controller/src/http.rs
index 1f3f78bffa..c59bcaa174 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -399,6 +399,15 @@ async fn handle_tenant_describe(
     json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 
+async fn handle_tenant_list(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    json_response(StatusCode::OK, service.tenant_list())
+}
+
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -412,7 +421,10 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
     check_permissions(&req, Scope::Admin)?;
 
     let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.node_list().await?)
+    let nodes = state.service.node_list().await?;
+    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
+
+    json_response(StatusCode::OK, api_nodes)
 }
 
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -590,9 +602,17 @@ where
     .await
 }
 
+/// Check if the required scope is held in the request's token, or if the request has
+/// a token with 'admin' scope then always permit it.
 fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
     check_permission_with(request, |claims| {
-        crate::auth::check_permission(claims, required_scope)
+        match crate::auth::check_permission(claims, required_scope) {
+            Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
+                Ok(()) => Ok(()),
+                Err(_) => Err(e),
+            },
+            Ok(()) => Ok(()),
+        }
     })
 }
 
@@ -793,6 +813,9 @@ pub fn make_router(
                 RequestName("control_v1_tenant_describe"),
             )
         })
+        .get("/control/v1/tenant", |r| {
+            tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
+        })
         .put("/control/v1/tenant/:tenant_id/policy", |r| {
             named_request_span(
                 r,
diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs
similarity index 100%
rename from control_plane/attachment_service/src/id_lock_map.rs
rename to storage_controller/src/id_lock_map.rs
diff --git a/control_plane/attachment_service/src/lib.rs b/storage_controller/src/lib.rs
similarity index 98%
rename from control_plane/attachment_service/src/lib.rs
rename to storage_controller/src/lib.rs
index 8bcd5c0ac4..2ea490a14b 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -14,7 +14,7 @@ mod reconciler;
 mod scheduler;
 mod schema;
 pub mod service;
-mod tenant_state;
+mod tenant_shard;
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
diff --git a/control_plane/attachment_service/src/main.rs b/storage_controller/src/main.rs
similarity index 96%
rename from control_plane/attachment_service/src/main.rs
rename to storage_controller/src/main.rs
index bd8d7f5c59..3c03d6efe8 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,18 +1,19 @@
 use anyhow::{anyhow, Context};
-use attachment_service::http::make_router;
-use attachment_service::metrics::preinitialize_metrics;
-use attachment_service::persistence::Persistence;
-use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
+use storage_controller::http::make_router;
+use storage_controller::metrics::preinitialize_metrics;
+use storage_controller::persistence::Persistence;
+use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
 
+use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version, tcp_listener};
 
 project_git_version!(GIT_VERSION);
@@ -50,7 +51,7 @@ struct Cli {
     #[arg(short, long)]
     path: Option<Utf8PathBuf>,
 
-    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
+    /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
     #[arg(long)]
     database_url: Option<String>,
 
@@ -158,6 +159,8 @@ fn main() -> anyhow::Result<()> {
         std::process::exit(1);
     }));
 
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
+
     tokio::runtime::Builder::new_current_thread()
         // We use spawn_blocking for database operations, so require approximately
         // as many blocking threads as we will open database connections.
diff --git a/control_plane/attachment_service/src/metrics.rs b/storage_controller/src/metrics.rs
similarity index 100%
rename from control_plane/attachment_service/src/metrics.rs
rename to storage_controller/src/metrics.rs
diff --git a/control_plane/attachment_service/src/node.rs b/storage_controller/src/node.rs
similarity index 93%
rename from control_plane/attachment_service/src/node.rs
rename to storage_controller/src/node.rs
index df40bff66f..7ba6828deb 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -3,7 +3,8 @@ use std::{str::FromStr, time::Duration};
 use hyper::StatusCode;
 use pageserver_api::{
     controller_api::{
-        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
+        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
+        TenantLocateResponseShard,
     },
     shard::TenantShardId,
 };
@@ -256,6 +257,19 @@ impl Node {
         )
         .await
     }
+
+    /// Generate the simplified API-friendly description of a node's state
+    pub(crate) fn describe(&self) -> NodeDescribeResponse {
+        NodeDescribeResponse {
+            id: self.id,
+            availability: self.availability.into(),
+            scheduling: self.scheduling,
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port,
+        }
+    }
 }
 
 impl std::fmt::Display for Node {
diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
similarity index 100%
rename from control_plane/attachment_service/src/pageserver_client.rs
rename to storage_controller/src/pageserver_client.rs
diff --git a/control_plane/attachment_service/src/persistence.rs b/storage_controller/src/persistence.rs
similarity index 99%
rename from control_plane/attachment_service/src/persistence.rs
rename to storage_controller/src/persistence.rs
index d60392bdbc..55fbfd10bc 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -696,7 +696,7 @@ impl Persistence {
     }
 }
 
-/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
+/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
 #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs
similarity index 100%
rename from control_plane/attachment_service/src/persistence/split_state.rs
rename to storage_controller/src/persistence/split_state.rs
diff --git a/control_plane/attachment_service/src/reconciler.rs b/storage_controller/src/reconciler.rs
similarity index 99%
rename from control_plane/attachment_service/src/reconciler.rs
rename to storage_controller/src/reconciler.rs
index 72eb8faccb..49cfaad569 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
 
 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
-use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
+use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
 
 const DEFAULT_HEATMAP_PERIOD: &str = "60s";
 
 /// Object with the lifetime of the background reconcile task that is created
 /// for tenants which have a difference between their intent and observed states.
 pub(super) struct Reconciler {
-    /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
+    /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
     /// of a tenant's state from when we spawned a reconcile task.
     pub(super) tenant_shard_id: TenantShardId,
     pub(crate) shard: ShardIdentity,
@@ -48,11 +48,11 @@ pub(super) struct Reconciler {
 
     /// To avoid stalling if the cloud control plane is unavailable, we may proceed
     /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
-    /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
+    /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
     pub(crate) compute_notify_failure: bool,
 
     /// A means to abort background reconciliation: it is essential to
-    /// call this when something changes in the original TenantState that
+    /// call this when something changes in the original TenantShard that
     /// will make this reconciliation impossible or unnecessary, for
     /// example when a pageserver node goes offline, or the PlacementPolicy for
     /// the tenant is changed.
@@ -66,7 +66,7 @@ pub(super) struct Reconciler {
     pub(crate) persistence: Arc<Persistence>,
 }
 
-/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
+/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
 /// reference counting for Scheduler.  The IntentState is what the scheduler works with,
 /// and the TargetState is just the instruction for a particular Reconciler run.
 #[derive(Debug)]
diff --git a/control_plane/attachment_service/src/scheduler.rs b/storage_controller/src/scheduler.rs
similarity index 98%
rename from control_plane/attachment_service/src/scheduler.rs
rename to storage_controller/src/scheduler.rs
index 782189d11f..862ac0cbfe 100644
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,4 +1,4 @@
-use crate::{node::Node, tenant_state::TenantState};
+use crate::{node::Node, tenant_shard::TenantShard};
 use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {
 
 #[derive(Serialize)]
 struct SchedulerNode {
-    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
+    /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
     shard_count: usize,
 
     /// Whether this node is currently elegible to have new shards scheduled (this is derived
@@ -84,7 +84,7 @@ impl std::ops::Add for AffinityScore {
     }
 }
 
-// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
+// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
 // it for many shards in the same tenant.
 #[derive(Debug, Default)]
 pub(crate) struct ScheduleContext {
@@ -147,7 +147,7 @@ impl Scheduler {
     pub(crate) fn consistency_check<'a>(
         &self,
         nodes: impl Iterator<Item = &'a Node>,
-        shards: impl Iterator<Item = &'a TenantState>,
+        shards: impl Iterator<Item = &'a TenantShard>,
     ) -> anyhow::Result<()> {
         let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
         for node in nodes {
@@ -398,7 +398,7 @@ pub(crate) mod test_utils {
 mod tests {
     use super::*;
 
-    use crate::tenant_state::IntentState;
+    use crate::tenant_shard::IntentState;
     #[test]
     fn scheduler_basic() -> anyhow::Result<()> {
         let nodes = test_utils::make_test_nodes(2);
diff --git a/control_plane/attachment_service/src/schema.rs b/storage_controller/src/schema.rs
similarity index 100%
rename from control_plane/attachment_service/src/schema.rs
rename to storage_controller/src/schema.rs
diff --git a/control_plane/attachment_service/src/service.rs b/storage_controller/src/service.rs
similarity index 97%
rename from control_plane/attachment_service/src/service.rs
rename to storage_controller/src/service.rs
index 7502d9d186..010558b797 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -20,6 +20,7 @@ use control_plane::storage_controller::{
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
+use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
         NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -65,9 +66,9 @@ use crate::{
     persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
     reconciler::attached_location_conf,
     scheduler::Scheduler,
-    tenant_state::{
+    tenant_shard::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
-        ReconcilerWaiter, TenantState,
+        ReconcilerWaiter, TenantShard,
     },
 };
 
@@ -91,7 +92,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
 
 // Top level state available to all HTTP handlers
 struct ServiceState {
-    tenants: BTreeMap<TenantShardId, TenantState>,
+    tenants: BTreeMap<TenantShardId, TenantShard>,
 
     nodes: Arc<HashMap<NodeId, Node>>,
 
@@ -101,7 +102,7 @@ struct ServiceState {
 impl ServiceState {
     fn new(
         nodes: HashMap<NodeId, Node>,
-        tenants: BTreeMap<TenantShardId, TenantState>,
+        tenants: BTreeMap<TenantShardId, TenantShard>,
         scheduler: Scheduler,
     ) -> Self {
         Self {
@@ -115,7 +116,7 @@ impl ServiceState {
         &mut self,
     ) -> (
         &mut Arc<HashMap<NodeId, Node>>,
-        &mut BTreeMap<TenantShardId, TenantState>,
+        &mut BTreeMap<TenantShardId, TenantShard>,
         &mut Scheduler,
     ) {
         (&mut self.nodes, &mut self.tenants, &mut self.scheduler)
@@ -334,11 +335,11 @@ impl Service {
 
             for (tenant_shard_id, shard_observations) in observed {
                 for (node_id, observed_loc) in shard_observations {
-                    let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
+                    let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
                         cleanup.push((tenant_shard_id, node_id));
                         continue;
                     };
-                    tenant_state
+                    tenant_shard
                         .observed
                         .locations
                         .insert(node_id, ObservedStateLocation { conf: observed_loc });
@@ -347,14 +348,14 @@ impl Service {
 
             // Populate each tenant's intent state
             let mut schedule_context = ScheduleContext::default();
-            for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
+            for (tenant_shard_id, tenant_shard) in tenants.iter_mut() {
                 if tenant_shard_id.shard_number == ShardNumber(0) {
                     // Reset scheduling context each time we advance to the next Tenant
                     schedule_context = ScheduleContext::default();
                 }
 
-                tenant_state.intent_from_observed(scheduler);
-                if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) {
+                tenant_shard.intent_from_observed(scheduler);
+                if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
                     // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
                     // not enough pageservers are available.  The tenant may well still be available
                     // to clients.
@@ -363,11 +364,11 @@ impl Service {
                     // If we're both intending and observed to be attached at a particular node, we will
                     // emit a compute notification for this. In the case where our observed state does not
                     // yet match our intent, we will eventually reconcile, and that will emit a compute notification.
-                    if let Some(attached_at) = tenant_state.stably_attached() {
+                    if let Some(attached_at) = tenant_shard.stably_attached() {
                         compute_notifications.push((
                             *tenant_shard_id,
                             attached_at,
-                            tenant_state.shard.stripe_size,
+                            tenant_shard.shard.stripe_size,
                         ));
                     }
                 }
@@ -742,7 +743,7 @@ impl Service {
 
     /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
     /// was successful, this will update the observed state of the tenant such that subsequent
-    /// calls to [`TenantState::maybe_reconcile`] will do nothing.
+    /// calls to [`TenantShard::maybe_reconcile`] will do nothing.
     #[instrument(skip_all, fields(
         tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
         sequence=%result.sequence
@@ -760,10 +761,10 @@ impl Service {
         tenant.generation = std::cmp::max(tenant.generation, result.generation);
 
         // If the reconciler signals that it failed to notify compute, set this state on
-        // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
+        // the shard so that a future [`TenantShard::maybe_reconcile`] will try again.
         tenant.pending_compute_notification = result.pending_compute_notification;
 
-        // Let the TenantState know it is idle.
+        // Let the TenantShard know it is idle.
         tenant.reconcile_complete(result.sequence);
 
         match result.result {
@@ -978,7 +979,7 @@ impl Service {
             if let Some(generation_pageserver) = tsp.generation_pageserver {
                 intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
             }
-            let new_tenant = TenantState::from_persistent(tsp, intent)?;
+            let new_tenant = TenantShard::from_persistent(tsp, intent)?;
 
             tenants.insert(tenant_shard_id, new_tenant);
         }
@@ -1125,7 +1126,7 @@ impl Service {
                     let mut locked = self.inner.write().unwrap();
                     locked.tenants.insert(
                         attach_req.tenant_shard_id,
-                        TenantState::new(
+                        TenantShard::new(
                             attach_req.tenant_shard_id,
                             ShardIdentity::unsharded(),
                             PlacementPolicy::Attached(0),
@@ -1177,32 +1178,32 @@ impl Service {
         let mut locked = self.inner.write().unwrap();
         let (_nodes, tenants, scheduler) = locked.parts_mut();
 
-        let tenant_state = tenants
+        let tenant_shard = tenants
             .get_mut(&attach_req.tenant_shard_id)
             .expect("Checked for existence above");
 
         if let Some(new_generation) = new_generation {
-            tenant_state.generation = Some(new_generation);
-            tenant_state.policy = PlacementPolicy::Attached(0);
+            tenant_shard.generation = Some(new_generation);
+            tenant_shard.policy = PlacementPolicy::Attached(0);
         } else {
             // This is a detach notification.  We must update placement policy to avoid re-attaching
             // during background scheduling/reconciliation, or during storage controller restart.
             assert!(attach_req.node_id.is_none());
-            tenant_state.policy = PlacementPolicy::Detached;
+            tenant_shard.policy = PlacementPolicy::Detached;
         }
 
         if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
             tracing::info!(
                 tenant_id = %attach_req.tenant_shard_id,
                 ps_id = %attaching_pageserver,
-                generation = ?tenant_state.generation,
+                generation = ?tenant_shard.generation,
                 "issuing",
             );
-        } else if let Some(ps_id) = tenant_state.intent.get_attached() {
+        } else if let Some(ps_id) = tenant_shard.intent.get_attached() {
             tracing::info!(
                 tenant_id = %attach_req.tenant_shard_id,
                 %ps_id,
-                generation = ?tenant_state.generation,
+                generation = ?tenant_shard.generation,
                 "dropping",
             );
         } else {
@@ -1210,14 +1211,14 @@ impl Service {
             tenant_id = %attach_req.tenant_shard_id,
             "no-op: tenant already has no pageserver");
         }
-        tenant_state
+        tenant_shard
             .intent
             .set_attached(scheduler, attach_req.node_id);
 
         tracing::info!(
             "attach_hook: tenant {} set generation {:?}, pageserver {}",
             attach_req.tenant_shard_id,
-            tenant_state.generation,
+            tenant_shard.generation,
             // TODO: this is an odd number of 0xf's
             attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
         );
@@ -1229,36 +1230,36 @@ impl Service {
         #[cfg(feature = "testing")]
         {
             if let Some(node_id) = attach_req.node_id {
-                tenant_state.observed.locations = HashMap::from([(
+                tenant_shard.observed.locations = HashMap::from([(
                     node_id,
                     ObservedStateLocation {
                         conf: Some(attached_location_conf(
-                            tenant_state.generation.unwrap(),
-                            &tenant_state.shard,
-                            &tenant_state.config,
+                            tenant_shard.generation.unwrap(),
+                            &tenant_shard.shard,
+                            &tenant_shard.config,
                             false,
                         )),
                     },
                 )]);
             } else {
-                tenant_state.observed.locations.clear();
+                tenant_shard.observed.locations.clear();
             }
         }
 
         Ok(AttachHookResponse {
             gen: attach_req
                 .node_id
-                .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
+                .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
         })
     }
 
     pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
         let locked = self.inner.read().unwrap();
 
-        let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
+        let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id);
 
         InspectResponse {
-            attachment: tenant_state.and_then(|s| {
+            attachment: tenant_shard.and_then(|s| {
                 s.intent
                     .get_attached()
                     .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
@@ -1320,11 +1321,11 @@ impl Service {
             let mut locked = self.inner.write().unwrap();
 
             for (tenant_shard_id, observed_loc) in configs.tenant_shards {
-                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                     cleanup.push(tenant_shard_id);
                     continue;
                 };
-                tenant_state
+                tenant_shard
                     .observed
                     .locations
                     .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
@@ -1495,13 +1496,13 @@ impl Service {
         };
 
         for req_tenant in validate_req.tenants {
-            if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
-                let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
+            if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
+                let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
                 tracing::info!(
                     "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
                     req_tenant.id,
                     req_tenant.gen,
-                    tenant_state.generation
+                    tenant_shard.generation
                 );
                 response.tenants.push(ValidateResponseTenant {
                     id: req_tenant.id,
@@ -1687,7 +1688,7 @@ impl Service {
                         continue;
                     }
                     Entry::Vacant(entry) => {
-                        let state = entry.insert(TenantState::new(
+                        let state = entry.insert(TenantShard::new(
                             tenant_shard_id,
                             ShardIdentity::from_params(
                                 tenant_shard_id.shard_number,
@@ -1762,6 +1763,9 @@ impl Service {
 
     /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
     /// and transform it into either a tenant creation of a series of shard updates.
+    ///
+    /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will
+    /// still be returned.
     fn tenant_location_config_prepare(
         &self,
         tenant_id: TenantId,
@@ -1809,17 +1813,12 @@ impl Service {
                 _ => None,
             };
 
-            if shard.policy != placement_policy
-                || shard.config != req.config.tenant_conf
-                || set_generation.is_some()
-            {
-                updates.push(ShardUpdate {
-                    tenant_shard_id: *shard_id,
-                    placement_policy: placement_policy.clone(),
-                    tenant_config: req.config.tenant_conf.clone(),
-                    generation: set_generation,
-                });
-            }
+            updates.push(ShardUpdate {
+                tenant_shard_id: *shard_id,
+                placement_policy: placement_policy.clone(),
+                tenant_config: req.config.tenant_conf.clone(),
+                generation: set_generation,
+            });
         }
 
         if create {
@@ -1848,6 +1847,7 @@ impl Service {
                 },
             )
         } else {
+            assert!(!updates.is_empty());
             TenantCreateOrUpdate::Update(updates)
         }
     }
@@ -2735,47 +2735,73 @@ impl Service {
         })
     }
 
-    pub(crate) fn tenant_describe(
+    /// Returns None if the input iterator of shards does not include a shard with number=0
+    fn tenant_describe_impl<'a>(
         &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantDescribeResponse, ApiError> {
-        let locked = self.inner.read().unwrap();
-
+        shards: impl Iterator<Item = &'a TenantShard>,
+    ) -> Option<TenantDescribeResponse> {
         let mut shard_zero = None;
-        let mut shards = Vec::new();
+        let mut describe_shards = Vec::new();
 
-        for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-        {
-            if tenant_shard_id.is_zero() {
+        for shard in shards {
+            if shard.tenant_shard_id.is_zero() {
                 shard_zero = Some(shard);
             }
 
-            let response_shard = TenantDescribeResponseShard {
-                tenant_shard_id: *tenant_shard_id,
+            describe_shards.push(TenantDescribeResponseShard {
+                tenant_shard_id: shard.tenant_shard_id,
                 node_attached: *shard.intent.get_attached(),
                 node_secondary: shard.intent.get_secondary().to_vec(),
                 last_error: shard.last_error.lock().unwrap().clone(),
                 is_reconciling: shard.reconciler.is_some(),
                 is_pending_compute_notification: shard.pending_compute_notification,
                 is_splitting: matches!(shard.splitting, SplitState::Splitting),
-            };
-            shards.push(response_shard);
+                scheduling_policy: *shard.get_scheduling_policy(),
+            })
         }
 
-        let Some(shard_zero) = shard_zero else {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
-            ));
-        };
+        let shard_zero = shard_zero?;
 
-        Ok(TenantDescribeResponse {
-            shards,
+        Some(TenantDescribeResponse {
+            tenant_id: shard_zero.tenant_shard_id.tenant_id,
+            shards: describe_shards,
             stripe_size: shard_zero.shard.stripe_size,
             policy: shard_zero.policy.clone(),
             config: shard_zero.config.clone(),
         })
     }
 
+    pub(crate) fn tenant_describe(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<TenantDescribeResponse, ApiError> {
+        let locked = self.inner.read().unwrap();
+
+        self.tenant_describe_impl(
+            locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .map(|(_k, v)| v),
+        )
+        .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
+    }
+
+    pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
+        let locked = self.inner.read().unwrap();
+
+        let mut result = Vec::new();
+        for (_tenant_id, tenant_shards) in
+            &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
+        {
+            result.push(
+                self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
+                    .expect("Groups are always non-empty"),
+            );
+        }
+
+        result
+    }
+
     #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
     async fn abort_tenant_shard_split(
         &self,
@@ -3012,7 +3038,7 @@ impl Service {
                         },
                     );
 
-                    let mut child_state = TenantState::new(child, child_shard, policy.clone());
+                    let mut child_state = TenantShard::new(child, child_shard, policy.clone());
                     child_state.intent = IntentState::single(scheduler, Some(pageserver));
                     child_state.observed = ObservedState {
                         locations: child_observed,
@@ -3020,7 +3046,7 @@ impl Service {
                     child_state.generation = Some(generation);
                     child_state.config = config.clone();
 
-                    // The child's TenantState::splitting is intentionally left at the default value of Idle,
+                    // The child's TenantShard::splitting is intentionally left at the default value of Idle,
                     // as at this point in the split process we have succeeded and this part is infallible:
                     // we will never need to do any special recovery from this state.
 
@@ -3569,8 +3595,8 @@ impl Service {
         Ok(())
     }
 
-    /// For debug/support: a full JSON dump of TenantStates.  Returns a response so that
-    /// we don't have to make TenantState clonable in the return path.
+    /// For debug/support: a full JSON dump of TenantShards.  Returns a response so that
+    /// we don't have to make TenantShard clonable in the return path.
     pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
         let serialized = {
             let locked = self.inner.read().unwrap();
@@ -3674,7 +3700,7 @@ impl Service {
     }
 
     /// For debug/support: a JSON dump of the [`Scheduler`].  Returns a response so that
-    /// we don't have to make TenantState clonable in the return path.
+    /// we don't have to make TenantShard clonable in the return path.
     pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
         let serialized = {
             let locked = self.inner.read().unwrap();
@@ -3891,8 +3917,8 @@ impl Service {
                 tracing::info!("Node {} transition to offline", node_id);
                 let mut tenants_affected: usize = 0;
 
-                for (tenant_shard_id, tenant_state) in tenants {
-                    if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
+                for (tenant_shard_id, tenant_shard) in tenants {
+                    if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
                         // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
                         // not assume our knowledge of the node's configuration is accurate until it comes back online
                         observed_loc.conf = None;
@@ -3905,24 +3931,24 @@ impl Service {
                         continue;
                     }
 
-                    if tenant_state.intent.demote_attached(node_id) {
-                        tenant_state.sequence = tenant_state.sequence.next();
+                    if tenant_shard.intent.demote_attached(node_id) {
+                        tenant_shard.sequence = tenant_shard.sequence.next();
 
                         // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
                         // for tenants without secondary locations: if they have a secondary location, then this
                         // schedule() call is just promoting an existing secondary)
                         let mut schedule_context = ScheduleContext::default();
 
-                        match tenant_state.schedule(scheduler, &mut schedule_context) {
+                        match tenant_shard.schedule(scheduler, &mut schedule_context) {
                             Err(e) => {
                                 // It is possible that some tenants will become unschedulable when too many pageservers
                                 // go offline: in this case there isn't much we can do other than make the issue observable.
-                                // TODO: give TenantState a scheduling error attribute to be queried later.
+                                // TODO: give TenantShard a scheduling error attribute to be queried later.
                                 tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
                             }
                             Ok(()) => {
                                 if self
-                                    .maybe_reconcile_shard(tenant_state, &new_nodes)
+                                    .maybe_reconcile_shard(tenant_shard, &new_nodes)
                                     .is_some()
                                 {
                                     tenants_affected += 1;
@@ -3941,10 +3967,10 @@ impl Service {
                 tracing::info!("Node {} transition to active", node_id);
                 // When a node comes back online, we must reconcile any tenant that has a None observed
                 // location on the node.
-                for tenant_state in locked.tenants.values_mut() {
-                    if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
+                for tenant_shard in locked.tenants.values_mut() {
+                    if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
                         if observed_loc.conf.is_none() {
-                            self.maybe_reconcile_shard(tenant_state, &new_nodes);
+                            self.maybe_reconcile_shard(tenant_shard, &new_nodes);
                         }
                     }
                 }
@@ -4027,11 +4053,11 @@ impl Service {
         Ok(())
     }
 
-    /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides
+    /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides
     /// all the references to parts of Self that are needed
     fn maybe_reconcile_shard(
         &self,
-        shard: &mut TenantState,
+        shard: &mut TenantShard,
         nodes: &Arc<HashMap<NodeId, Node>>,
     ) -> Option<ReconcilerWaiter> {
         shard.maybe_reconcile(
@@ -4097,7 +4123,7 @@ impl Service {
 
         let mut reconciles_spawned = 0;
 
-        let mut tenant_shards: Vec<&TenantState> = Vec::new();
+        let mut tenant_shards: Vec<&TenantShard> = Vec::new();
 
         // Limit on how many shards' optmizations each call to this function will execute.  Combined
         // with the frequency of background calls, this acts as an implicit rate limit that runs a small
@@ -4228,7 +4254,7 @@ impl Service {
 
     pub async fn shutdown(&self) {
         // Note that this already stops processing any results from reconciles: so
-        // we do not expect that our [`TenantState`] objects will reach a neat
+        // we do not expect that our [`TenantShard`] objects will reach a neat
         // final state.
         self.cancel.cancel();
 
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/storage_controller/src/tenant_shard.rs
similarity index 96%
rename from control_plane/attachment_service/src/tenant_state.rs
rename to storage_controller/src/tenant_shard.rs
index 6717b8e178..58b8ef8d5d 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -50,7 +50,7 @@ where
 /// This struct implement Serialize for debugging purposes, but is _not_ persisted
 /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
 #[derive(Serialize)]
-pub(crate) struct TenantState {
+pub(crate) struct TenantShard {
     pub(crate) tenant_shard_id: TenantShardId,
 
     pub(crate) shard: ShardIdentity,
@@ -354,7 +354,7 @@ pub(crate) struct ReconcilerHandle {
 }
 
 /// When a reconcile task completes, it sends this result object
-/// to be applied to the primary TenantState.
+/// to be applied to the primary TenantShard.
 pub(crate) struct ReconcileResult {
     pub(crate) sequence: Sequence,
     /// On errors, `observed` should be treated as an incompleted description
@@ -367,7 +367,7 @@ pub(crate) struct ReconcileResult {
     pub(crate) generation: Option<Generation>,
     pub(crate) observed: ObservedState,
 
-    /// Set [`TenantState::pending_compute_notification`] from this flag
+    /// Set [`TenantShard::pending_compute_notification`] from this flag
     pub(crate) pending_compute_notification: bool,
 }
 
@@ -379,7 +379,7 @@ impl ObservedState {
     }
 }
 
-impl TenantState {
+impl TenantShard {
     pub(crate) fn new(
         tenant_shard_id: TenantShardId,
         shard: ShardIdentity,
@@ -1143,7 +1143,7 @@ pub(crate) mod tests {
 
     use super::*;
 
-    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
         let tenant_id = TenantId::generate();
         let shard_number = ShardNumber(0);
         let shard_count = ShardCount::new(1);
@@ -1153,7 +1153,7 @@ pub(crate) mod tests {
             shard_number,
             shard_count,
         };
-        TenantState::new(
+        TenantShard::new(
             tenant_shard_id,
             ShardIdentity::new(
                 shard_number,
@@ -1165,7 +1165,7 @@ pub(crate) mod tests {
         )
     }
 
-    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
+    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
         let tenant_id = TenantId::generate();
 
         (0..shard_count.count())
@@ -1177,7 +1177,7 @@ pub(crate) mod tests {
                     shard_number,
                     shard_count,
                 };
-                TenantState::new(
+                TenantShard::new(
                     tenant_shard_id,
                     ShardIdentity::new(
                         shard_number,
@@ -1202,24 +1202,24 @@ pub(crate) mod tests {
         let mut scheduler = Scheduler::new(nodes.values());
         let mut context = ScheduleContext::default();
 
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        tenant_state
+        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        tenant_shard
             .schedule(&mut scheduler, &mut context)
             .expect("we have enough nodes, scheduling should work");
 
         // Expect to initially be schedule on to different nodes
-        assert_eq!(tenant_state.intent.secondary.len(), 1);
-        assert!(tenant_state.intent.attached.is_some());
+        assert_eq!(tenant_shard.intent.secondary.len(), 1);
+        assert!(tenant_shard.intent.attached.is_some());
 
-        let attached_node_id = tenant_state.intent.attached.unwrap();
-        let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
+        let attached_node_id = tenant_shard.intent.attached.unwrap();
+        let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
         assert_ne!(attached_node_id, secondary_node_id);
 
         // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_state.intent.demote_attached(attached_node_id);
+        let changed = tenant_shard.intent.demote_attached(attached_node_id);
         assert!(changed);
-        assert!(tenant_state.intent.attached.is_none());
-        assert_eq!(tenant_state.intent.secondary.len(), 2);
+        assert!(tenant_shard.intent.attached.is_none());
+        assert_eq!(tenant_shard.intent.secondary.len(), 2);
 
         // Update the scheduler state to indicate the node is offline
         nodes
@@ -1229,18 +1229,18 @@ pub(crate) mod tests {
         scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
 
         // Scheduling the node should promote the still-available secondary node to attached
-        tenant_state
+        tenant_shard
             .schedule(&mut scheduler, &mut context)
             .expect("active nodes are available");
-        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
+        assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
 
         // The original attached node should have been retained as a secondary
         assert_eq!(
-            *tenant_state.intent.secondary.iter().last().unwrap(),
+            *tenant_shard.intent.secondary.iter().last().unwrap(),
             attached_node_id
         );
 
-        tenant_state.intent.clear(&mut scheduler);
+        tenant_shard.intent.clear(&mut scheduler);
 
         Ok(())
     }
@@ -1250,48 +1250,48 @@ pub(crate) mod tests {
         let nodes = make_test_nodes(3);
         let mut scheduler = Scheduler::new(nodes.values());
 
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
 
-        tenant_state.observed.locations.insert(
+        tenant_shard.observed.locations.insert(
             NodeId(3),
             ObservedStateLocation {
                 conf: Some(LocationConfig {
                     mode: LocationConfigMode::AttachedMulti,
                     generation: Some(2),
                     secondary_conf: None,
-                    shard_number: tenant_state.shard.number.0,
-                    shard_count: tenant_state.shard.count.literal(),
-                    shard_stripe_size: tenant_state.shard.stripe_size.0,
+                    shard_number: tenant_shard.shard.number.0,
+                    shard_count: tenant_shard.shard.count.literal(),
+                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
                     tenant_conf: TenantConfig::default(),
                 }),
             },
         );
 
-        tenant_state.observed.locations.insert(
+        tenant_shard.observed.locations.insert(
             NodeId(2),
             ObservedStateLocation {
                 conf: Some(LocationConfig {
                     mode: LocationConfigMode::AttachedStale,
                     generation: Some(1),
                     secondary_conf: None,
-                    shard_number: tenant_state.shard.number.0,
-                    shard_count: tenant_state.shard.count.literal(),
-                    shard_stripe_size: tenant_state.shard.stripe_size.0,
+                    shard_number: tenant_shard.shard.number.0,
+                    shard_count: tenant_shard.shard.count.literal(),
+                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
                     tenant_conf: TenantConfig::default(),
                 }),
             },
         );
 
-        tenant_state.intent_from_observed(&mut scheduler);
+        tenant_shard.intent_from_observed(&mut scheduler);
 
         // The highest generationed attached location gets used as attached
-        assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
+        assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
         // Other locations get used as secondary
-        assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
+        assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
 
-        scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
+        scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
 
-        tenant_state.intent.clear(&mut scheduler);
+        tenant_shard.intent.clear(&mut scheduler);
         Ok(())
     }
 
@@ -1300,23 +1300,23 @@ pub(crate) mod tests {
         let nodes = make_test_nodes(3);
         let mut scheduler = Scheduler::new(nodes.values());
 
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
 
         // In pause mode, schedule() shouldn't do anything
-        tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
-        assert!(tenant_state
+        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
+        assert!(tenant_shard
             .schedule(&mut scheduler, &mut ScheduleContext::default())
             .is_ok());
-        assert!(tenant_state.intent.all_pageservers().is_empty());
+        assert!(tenant_shard.intent.all_pageservers().is_empty());
 
         // In active mode, schedule() works
-        tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
-        assert!(tenant_state
+        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
+        assert!(tenant_shard
             .schedule(&mut scheduler, &mut ScheduleContext::default())
             .is_ok());
-        assert!(!tenant_state.intent.all_pageservers().is_empty());
+        assert!(!tenant_shard.intent.all_pageservers().is_empty());
 
-        tenant_state.intent.clear(&mut scheduler);
+        tenant_shard.intent.clear(&mut scheduler);
         Ok(())
     }
 
@@ -1429,7 +1429,7 @@ pub(crate) mod tests {
     fn optimize_til_idle(
         nodes: &HashMap<NodeId, Node>,
         scheduler: &mut Scheduler,
-        shards: &mut [TenantState],
+        shards: &mut [TenantShard],
     ) {
         let mut loop_n = 0;
         loop {
diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index e7959c1764..c32748f6f0 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -482,20 +482,18 @@ def pytest_terminal_summary(
                 terminalreporter.section("Benchmark results", "-")
                 is_header_printed = True
 
-            terminalreporter.write(
-                "{}.{}: ".format(test_report.head_line, recorded_property["name"])
-            )
+            terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ")
             unit = recorded_property["unit"]
             value = recorded_property["value"]
             if unit == "MB":
-                terminalreporter.write("{0:,.0f}".format(value), green=True)
+                terminalreporter.write(f"{value:,.0f}", green=True)
             elif unit in ("s", "ms") and isinstance(value, float):
-                terminalreporter.write("{0:,.3f}".format(value), green=True)
+                terminalreporter.write(f"{value:,.3f}", green=True)
             elif isinstance(value, float):
-                terminalreporter.write("{0:,.4f}".format(value), green=True)
+                terminalreporter.write(f"{value:,.4f}", green=True)
             else:
                 terminalreporter.write(str(value), green=True)
-            terminalreporter.line(" {}".format(unit))
+            terminalreporter.line(f" {unit}")
 
             result_entry.append(recorded_property)
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d0519d3406..0e4a58c099 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -520,9 +520,9 @@ class NeonEnvBuilder:
         self.env = NeonEnv(self)
         return self.env
 
-    def start(self, register_pageservers=False):
+    def start(self):
         assert self.env is not None, "environment is not already initialized, call init() first"
-        self.env.start(register_pageservers=register_pageservers)
+        self.env.start()
 
     def init_start(
         self,
@@ -1115,8 +1115,8 @@ class NeonEnv:
         log.info(f"Config: {cfg}")
         self.neon_cli.init(cfg, force=config.config_init_force)
 
-    def start(self, register_pageservers=False):
-        # storage controller starts first, so that pageserver /re-attach calls don't
+    def start(self):
+        # Storage controller starts first, so that pageserver /re-attach calls don't
         # bounce through retries on startup
         self.storage_controller.start()
 
@@ -1127,11 +1127,6 @@ class NeonEnv:
         # reconcile.
         wait_until(30, 1, storage_controller_ready)
 
-        if register_pageservers:
-            # Special case for forward compat tests, this can be removed later.
-            for pageserver in self.pageservers:
-                self.storage_controller.node_register(pageserver)
-
         # Start up broker, pageserver and all safekeepers
         futs = []
         with concurrent.futures.ThreadPoolExecutor(
@@ -3605,7 +3600,7 @@ class Safekeeper:
         return self
 
     def stop(self, immediate: bool = False) -> "Safekeeper":
-        log.info("Stopping safekeeper {}".format(self.id))
+        log.info(f"Stopping safekeeper {self.id}")
         self.env.neon_cli.safekeeper_stop(self.id, immediate)
         self.running = False
         return self
@@ -4037,13 +4032,13 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
     for f in mismatch:
         f1 = os.path.join(endpoint.pgdata_dir, f)
         f2 = os.path.join(restored_dir_path, f)
-        stdout_filename = "{}.filediff".format(f2)
+        stdout_filename = f"{f2}.filediff"
 
         with open(stdout_filename, "w") as stdout_f:
-            subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True)
-            subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True)
+            subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True)
+            subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True)
 
-            cmd = "diff {}.hex {}.hex".format(f1, f2)
+            cmd = f"diff {f1}.hex {f2}.hex"
             subprocess.run([cmd], stdout=stdout_f, shell=True)
 
     assert (mismatch, error) == ([], [])
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 6aebfbc99c..b899b0dac8 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -308,6 +308,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             params=params,
         )
         self.verbose_error(res)
+        return res.json()
 
     def tenant_list_locations(self):
         res = self.get(
@@ -341,8 +342,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
         self.verbose_error(res)
 
-    def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+    def tenant_status(
+        self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
+    ) -> Dict[Any, Any]:
+        """
+        :activate: hint the server not to accelerate activation of this tenant in response
+        to this query.  False by default for tests, because they generally want to observed the
+        system rather than interfering with it.  This is true  by default on the server side,
+        because in the field if the control plane is GET'ing a tenant it's a sign that it wants
+        to do something with it.
+        """
+        params = {}
+        if not activate:
+            params["activate"] = "false"
+
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params)
         self.verbose_error(res)
         res_json = res.json()
         assert isinstance(res_json, dict)
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 693771dd3d..4b0dd7a815 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -204,13 +204,11 @@ def wait_for_last_record_lsn(
             return current_lsn
         if i % 10 == 0:
             log.info(
-                "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
-                    tenant, timeline, lsn, current_lsn, i + 1
-                )
+                f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}"
             )
         time.sleep(0.1)
     raise Exception(
-        "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
+        f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}"
     )
 
 
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index ab8717de54..c44628ce06 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -81,9 +81,13 @@ class Workload:
 
         return self._endpoint
 
-    def __del__(self):
+    def stop(self):
         if self._endpoint is not None:
             self._endpoint.stop()
+            self._endpoint = None
+
+    def __del__(self):
+        self.stop()
 
     def init(self, pageserver_id: Optional[int] = None):
         endpoint = self.endpoint(pageserver_id)
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index 324ef0d516..b66db4d0ab 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -125,19 +125,19 @@ async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int):
     await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)")
     await conn.execute(f"INSERT INTO {table} VALUES (1, 0)")
     await conn.execute(
+        f"""
+        CREATE PROCEDURE updating{table}() as
+        $$
+            DECLARE
+            i integer;
+            BEGIN
+            FOR i IN 1..{n_txns} LOOP
+                UPDATE {table} SET x = x + 1 WHERE pk=1;
+                COMMIT;
+            END LOOP;
+            END
+        $$ LANGUAGE plpgsql
         """
-         CREATE PROCEDURE updating{0}() as
-         $$
-             DECLARE
-             i integer;
-             BEGIN
-             FOR i IN 1..{1} LOOP
-                 UPDATE {0} SET x = x + 1 WHERE pk=1;
-                 COMMIT;
-             END LOOP;
-             END
-         $$ LANGUAGE plpgsql
-         """.format(table, n_txns)
     )
     await conn.execute("SET statement_timeout=0")
     await conn.execute(f"call updating{table}()")
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 9777bf6748..54905759bd 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -78,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
         p = random.randint(0, i)
 
         timer = timeit.default_timer()
-        env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant)
+        env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant)
         dur = timeit.default_timer() - timer
 
         log.info(f"Creating branch b{i+1} took {dur}s")
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 2a7a3c41ac..5b69649007 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -84,11 +84,11 @@ def test_branching_with_pgbench(
             threads = []
 
         if ty == "cascade":
-            env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant)
+            env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant)
         else:
-            env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant)
+            env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant)
 
-        endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant))
+        endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant))
 
         threads.append(
             threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True)
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 5406acc005..208263a22a 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -192,6 +192,9 @@ def test_backward_compatibility(
     assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
+# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530
+# The test is disabled until the next release deployment
+@pytest.mark.xfail
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
@@ -226,10 +229,6 @@ def test_forward_compatibility(
     )
 
     try:
-        # TODO: remove this once the previous pageserrver version understands
-        # the 'get_vectored_impl' config
-        neon_env_builder.pageserver_get_vectored_impl = None
-
         neon_env_builder.num_safekeepers = 3
         neon_local_binpath = neon_env_builder.neon_binpath
         env = neon_env_builder.from_repo_dir(
@@ -238,15 +237,11 @@ def test_forward_compatibility(
             pg_distrib_dir=compatibility_postgres_distrib_dir,
         )
 
-        # TODO: remove this workaround after release-5090 is no longer the most recent release.
-        # There was a bug in that code that generates a warning in the storage controller log.
-        env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")
-
         # Use current neon_local even though we're using old binaries for
         # everything else: our test code is written for latest CLI args.
         env.neon_local_binpath = neon_local_binpath
 
-        neon_env_builder.start(register_pageservers=True)
+        neon_env_builder.start()
 
         check_neon_works(
             env,
diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py
index b6ac1aa41f..c5d5b5fe64 100644
--- a/test_runner/regress/test_large_schema.py
+++ b/test_runner/regress/test_large_schema.py
@@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
     cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid")
 
     # Check layer file sizes
-    timeline_path = "{}/tenants/{}/timelines/{}/".format(
-        env.pageserver.workdir, env.initial_tenant, env.initial_timeline
+    timeline_path = (
+        f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/"
     )
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py
index 2fdee89389..77dc8a35b5 100644
--- a/test_runner/regress/test_layer_bloating.py
+++ b/test_runner/regress/test_layer_bloating.py
@@ -57,9 +57,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
     time.sleep(10)
 
     # Check layer file sizes
-    timeline_path = "{}/tenants/{}/timelines/{}/".format(
-        env.pageserver.workdir, env.initial_tenant, timeline
-    )
+    timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/"
     log.info(f"Check {timeline_path}")
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 41fa03cdf8..67f68a62af 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -9,7 +9,6 @@ of the pageserver are:
 - Updates to remote_consistent_lsn may only be made visible after validating generation
 """
 
-
 import enum
 import re
 import time
@@ -23,6 +22,7 @@ from fixtures.neon_fixtures import (
     NeonPageserver,
     PgBin,
     S3Scrubber,
+    flush_ep_to_pageserver,
     last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException
@@ -31,6 +31,7 @@ from fixtures.pageserver.utils import (
     list_prefix,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import (
     RemoteStorageKind,
@@ -112,7 +113,6 @@ def generate_uploads_and_deletions(
             last_flush_lsn_upload(
                 env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
             )
-            ps_http.timeline_checkpoint(tenant_id, timeline_id)
 
         # Compaction should generate some GC-elegible layers
         for i in range(0, 2):
@@ -122,6 +122,17 @@ def generate_uploads_and_deletions(
         print_gc_result(gc_result)
         assert gc_result["layers_removed"] > 0
 
+        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
+        # ensures that the pageserver is in a fully idle state: there will be no more
+        # background ingest, no more uploads pending, and therefore no non-determinism
+        # in subsequent actions like pageserver restarts.
+        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        # Finish uploads
+        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
+        # Finish all remote writes (including deletions)
+        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+
 
 def read_all(
     env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
@@ -386,9 +397,8 @@ def test_deletion_queue_recovery(
     if validate_before == ValidateBefore.NO_VALIDATE:
         failpoints.append(
             # Prevent deletion lists from being validated, we will test that they are
-            # dropped properly during recovery.  'pause' is okay here because we kill
-            # the pageserver with immediate=true
-            ("control-plane-client-validate", "pause")
+            # dropped properly during recovery.  This is such a long sleep as to be equivalent to "never"
+            ("control-plane-client-validate", "return(3600000)")
         )
 
     ps_http.configure_failpoints(failpoints)
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index ca6f77c75f..345abdc072 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -498,9 +498,19 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
 
     ps_secondary.http_client().tenant_secondary_download(tenant_id)
 
-    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
-        ps_secondary, tenant_id, timeline_id
-    )
+    try:
+        assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+            ps_secondary, tenant_id, timeline_id
+        )
+    except:
+        # Do a full listing of the secondary location on errors, to help debug of
+        # https://github.com/neondatabase/neon/issues/6966
+        timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id)
+        for path, _dirs, files in os.walk(timeline_path):
+            for f in files:
+                log.info(f"Secondary file: {os.path.join(path, f)}")
+
+        raise
 
     # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
     # walreceiver is still doing something.
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 3e986a8f7b..f446f4f200 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -566,38 +566,6 @@ async def test_sql_over_http2(static_proxy: NeonProxy):
     assert resp["rows"] == [{"answer": 42}]
 
 
-def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create role http with login password 'http' superuser")
-
-    static_proxy.safe_psql("create table test_table ( id int primary key )")
-
-    # insert into a table, with a unique constraint, after sleeping for n seconds
-    query = "WITH temp AS ( \
-        SELECT pg_sleep($1) as sleep, $2::int as id \
-    ) INSERT INTO test_table (id) SELECT id FROM temp"
-
-    # expect to fail with timeout
-    res = static_proxy.http_query(
-        query,
-        [static_proxy.http_timeout_seconds + 1, 1],
-        user="http",
-        password="http",
-        expected_code=400,
-    )
-    assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out"
-
-    time.sleep(2)
-
-    res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
-    assert res["command"] == "INSERT", "HTTP query should insert"
-    assert res["rowCount"] == 1, "HTTP query should insert"
-
-    res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
-    assert (
-        "duplicate key value violates unique constraint" in res["message"]
-    ), "HTTP query should conflict"
-
-
 def test_sql_over_http_connection_cancel(static_proxy: NeonProxy):
     static_proxy.safe_psql("create role http with login password 'http' superuser")
 
diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py
index effb7e83f9..868b80a561 100644
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -22,7 +22,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
             for e in extensions:
-                c.execute("create extension if not exists {};".format(e))
+                c.execute(f"create extension if not exists {e};")
 
             c.execute("create table foo (c int) with (autovacuum_enabled = false)")
             c.execute("insert into foo values (1)")
@@ -42,14 +42,12 @@ def test_read_validation(neon_simple_env: NeonEnv):
             log.info("Test table is populated, validating buffer cache")
 
             cache_entries = query_scalar(
-                c, "select count(*) from pg_buffercache where relfilenode =  {}".format(relfilenode)
+                c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"
             )
             assert cache_entries > 0, "No buffers cached for the test relation"
 
             c.execute(
-                "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format(
-                    relfilenode
-                )
+                f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}"
             )
             reln = c.fetchone()
             assert reln is not None
@@ -59,22 +57,20 @@ def test_read_validation(neon_simple_env: NeonEnv):
             c.execute("select clear_buffer_cache()")
 
             cache_entries = query_scalar(
-                c, "select count(*) from pg_buffercache where relfilenode =  {}".format(relfilenode)
+                c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"
             )
             assert cache_entries == 0, "Failed to clear buffer cache"
 
             log.info("Cache is clear, reading stale page version")
 
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format(
-                    first[0]
-                )
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))"
             )
             direct_first = c.fetchone()
             assert first == direct_first, "Failed fetch page at historic lsn"
 
             cache_entries = query_scalar(
-                c, "select count(*) from pg_buffercache where relfilenode =  {}".format(relfilenode)
+                c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"
             )
             assert cache_entries == 0, "relation buffers detected after invalidation"
 
@@ -87,7 +83,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
             assert second == direct_latest, "Failed fetch page at latest lsn"
 
             cache_entries = query_scalar(
-                c, "select count(*) from pg_buffercache where relfilenode =  {}".format(relfilenode)
+                c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"
             )
             assert cache_entries == 0, "relation buffers detected after invalidation"
 
@@ -96,9 +92,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
             )
 
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format(
-                    reln[0], reln[1], reln[2], first[0]
-                )
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))"
             )
             direct_first = c.fetchone()
             assert first == direct_first, "Failed fetch page at historic lsn using oid"
@@ -108,9 +102,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
             )
 
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format(
-                    reln[0], reln[1], reln[2]
-                )
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))"
             )
             direct_latest = c.fetchone()
             assert second == direct_latest, "Failed fetch page at latest lsn"
@@ -122,9 +114,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
             )
 
             c.execute(
-                "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format(
-                    reln[0], reln[1], reln[2], first[0]
-                )
+                f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))"
             )
             direct_first = c.fetchone()
             assert first == direct_first, "Failed fetch page at historic lsn using oid"
@@ -134,7 +124,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
                 c.execute("select * from page_header(get_raw_page('foo', 'main', 0));")
                 raise AssertionError("query should have failed")
             except UndefinedTable as e:
-                log.info("Caught an expected failure: {}".format(e))
+                log.info(f"Caught an expected failure: {e}")
 
 
 def test_read_validation_neg(neon_simple_env: NeonEnv):
@@ -148,7 +138,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
     with closing(endpoint.connect()) as con:
         with con.cursor() as c:
             for e in extensions:
-                c.execute("create extension if not exists {};".format(e))
+                c.execute(f"create extension if not exists {e};")
 
             log.info("read a page of a missing relation")
             try:
@@ -157,7 +147,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
                 )
                 raise AssertionError("query should have failed")
             except UndefinedTable as e:
-                log.info("Caught an expected failure: {}".format(e))
+                log.info(f"Caught an expected failure: {e}")
 
             c.execute("create table foo (c int) with (autovacuum_enabled = false)")
             c.execute("insert into foo values (1)")
@@ -169,7 +159,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
                 )
                 raise AssertionError("query should have failed")
             except IoError as e:
-                log.info("Caught an expected failure: {}".format(e))
+                log.info(f"Caught an expected failure: {e}")
 
             log.info("Pass NULL as an input")
             expected = (None, None, None)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 2699654f80..bfaab9125f 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -10,9 +10,13 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    S3Scrubber,
     StorageControllerApiException,
+    last_flush_lsn_upload,
     tenant_get_shards,
+    wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
 from fixtures.remote_storage import s3_storage
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
@@ -67,6 +71,15 @@ def test_sharding_smoke(
         log.info(f"sizes = {sizes}")
         return sizes
 
+    # The imported initdb for timeline creation should
+    # not be fully imported on every shard.  We use a 1MB strripe size so expect
+    # pretty good distribution: no one shard should have more than half the data
+    sizes = get_sizes()
+    physical_initdb_total = sum(sizes.values())
+    expect_initdb_size = 20 * 1024 * 1024
+    assert physical_initdb_total > expect_initdb_size
+    assert all(s < expect_initdb_size // 2 for s in sizes.values())
+
     # Test that timeline creation works on a sharded tenant
     timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id)
 
@@ -99,6 +112,38 @@ def test_sharding_smoke(
 
     env.storage_controller.consistency_check()
 
+    # Validate that deleting a sharded tenant removes all files in the prefix
+
+    # Before deleting, stop the client and check we have some objects to delete
+    workload.stop()
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    # Check the scrubber isn't confused by sharded content, then disable
+    # it during teardown because we'll have deleted by then
+    S3Scrubber(neon_env_builder).scan_metadata()
+    neon_env_builder.scrub_on_exit = False
+
+    env.storage_controller.pageserver_api().tenant_delete(tenant_id)
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+    env.storage_controller.consistency_check()
+
 
 def test_sharding_split_unsharded(
     neon_env_builder: NeonEnvBuilder,
@@ -466,13 +511,11 @@ def test_sharding_split_stripe_size(
     os.getenv("BUILD_TYPE") == "debug",
     reason="Avoid running bulkier ingest tests in debug mode",
 )
-def test_sharding_ingest(
+def test_sharding_ingest_layer_sizes(
     neon_env_builder: NeonEnvBuilder,
 ):
     """
-    Check behaviors related to ingest:
-    - That we generate properly sized layers
-    - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers
+    Check that when ingesting data to a sharded tenant, we properly respect layer size limts.
     """
 
     # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic
@@ -503,6 +546,7 @@ def test_sharding_ingest(
     workload.write_rows(4096, upload=False)
     workload.write_rows(4096, upload=False)
     workload.write_rows(4096, upload=False)
+
     workload.validate()
 
     small_layer_count = 0
@@ -515,7 +559,9 @@ def test_sharding_ingest(
         shard_id = shard["shard_id"]
         layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id)
 
-        for layer in layer_map.historic_layers:
+        historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
+
+        for layer in historic_layers:
             assert layer.layer_file_size is not None
             if layer.layer_file_size < expect_layer_size // 2:
                 classification = "Small"
@@ -552,6 +598,93 @@ def test_sharding_ingest(
     assert huge_layer_count <= shard_count
 
 
+def test_sharding_ingest_gaps(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Check ingest behavior when the incoming data results in some shards having gaps where
+    no data is ingested: they should advance their disk_consistent_lsn and remote_consistent_lsn
+    even if they aren't writing out layers.
+    """
+
+    # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic
+    # without writing a lot of data.
+    expect_layer_size = 131072
+    checkpoint_interval_secs = 5
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{expect_layer_size}",
+        "compaction_target_size": f"{expect_layer_size}",
+        # Set a short checkpoint interval as we will wait for uploads to happen
+        "checkpoint_timeout": f"{checkpoint_interval_secs}s",
+        # Background checkpointing is done from compaction loop, so set that interval short too
+        "compaction_period": "1s",
+    }
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+        initial_tenant_shard_stripe_size=128,
+    )
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Just a few writes: we aim to produce a situation where some shards are skipping
+    # ingesting some records and thereby won't have layer files that advance their
+    # consistent LSNs, to exercise the code paths that explicitly handle this case by
+    # advancing consistent LSNs in the background if there is no open layer.
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+    workload.write_rows(128, upload=False)
+    workload.churn_rows(128, upload=False)
+
+    # Checkpoint, so that we won't get a background checkpoint happening during the next step
+    workload.endpoint().safe_psql("checkpoint")
+    # Freeze + flush, so that subsequent writes will start from a position of no open layers
+    last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id)
+
+    # This write is tiny: at least some of the shards should find they don't have any
+    # data to ingest.  This will exercise how they handle that.
+    workload.churn_rows(1, upload=False)
+
+    # The LSN that has reached pageservers, but may not have been flushed to historic layers yet
+    expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id)
+
+    # Don't leave the endpoint running, we don't want it writing in the background
+    workload.stop()
+
+    log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}")
+
+    shards = tenant_get_shards(env, tenant_id, None)
+
+    def assert_all_disk_consistent():
+        """
+        Assert that all the shards' disk_consistent_lsns have reached expect_lsn
+        """
+        for tenant_shard_id, pageserver in shards:
+            timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id)
+            log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}")
+            assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn
+
+    # We set a short checkpoint timeout: expect things to get frozen+flushed within that
+    wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent)
+
+    def assert_all_remote_consistent():
+        """
+        Assert that all the shards' remote_consistent_lsns have reached expect_lsn
+        """
+        for tenant_shard_id, pageserver in shards:
+            timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id)
+            log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}")
+            assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn
+
+    # We set a short checkpoint timeout: expect things to get frozen+flushed within that
+    wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent)
+
+    workload.validate()
+
+
 class Failure:
     pageserver_id: Optional[int]
 
@@ -795,6 +928,8 @@ def test_sharding_split_failures(
             ".*Reconcile error: receive body: error sending request for url.*",
             # Node offline cases will fail inside reconciler when detaching secondaries
             ".*Reconcile error on shard.*: receive body: error sending request for url.*",
+            # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
+            ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
         ]
     )
 
diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_storage_controller.py
similarity index 88%
rename from test_runner/regress/test_sharding_service.py
rename to test_runner/regress/test_storage_controller.py
index 5a86e03d2b..840f354142 100644
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1,3 +1,4 @@
+import json
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
@@ -24,7 +25,7 @@ from fixtures.pageserver.utils import (
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
 from fixtures.types import TenantId, TenantShardId, TimelineId
-from fixtures.utils import run_pg_bench_small, wait_until
+from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
 from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
@@ -41,11 +42,11 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids):
     return counts
 
 
-def test_sharding_service_smoke(
+def test_storage_controller_smoke(
     neon_env_builder: NeonEnvBuilder,
 ):
     """
-    Test the basic lifecycle of a sharding service:
+    Test the basic lifecycle of a storage controller:
     - Restarting
     - Restarting a pageserver
     - Creating and deleting tenants and timelines
@@ -203,7 +204,7 @@ def test_node_status_after_restart(
     env.storage_controller.consistency_check()
 
 
-def test_sharding_service_passthrough(
+def test_storage_controller_passthrough(
     neon_env_builder: NeonEnvBuilder,
 ):
     """
@@ -230,7 +231,7 @@ def test_sharding_service_passthrough(
     env.storage_controller.consistency_check()
 
 
-def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
+def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     tenant_a = env.initial_tenant
     tenant_b = TenantId.generate()
@@ -265,7 +266,7 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
 
 
 @pytest.mark.parametrize("warm_up", [True, False])
-def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
+def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
     """
     We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
     which provides the /location_config API.  This is similar to creating a tenant,
@@ -302,7 +303,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     origin_ps.http_client().tenant_create(tenant_id, generation=generation)
 
     # As if doing a live migration, first configure origin into stale mode
-    origin_ps.http_client().tenant_location_conf(
+    r = origin_ps.http_client().tenant_location_conf(
         tenant_id,
         {
             "mode": "AttachedStale",
@@ -311,6 +312,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
             "generation": generation,
         },
     )
+    assert len(r["shards"]) == 1
 
     if warm_up:
         origin_ps.http_client().tenant_heatmap_upload(tenant_id)
@@ -331,7 +333,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
 
     # Call into storage controller to onboard the tenant
     generation += 1
-    virtual_ps_http.tenant_location_conf(
+    r = virtual_ps_http.tenant_location_conf(
         tenant_id,
         {
             "mode": "AttachedMulti",
@@ -340,6 +342,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
             "generation": generation,
         },
     )
+    assert len(r["shards"]) == 1
 
     # As if doing a live migration, detach the original pageserver
     origin_ps.http_client().tenant_location_conf(
@@ -356,7 +359,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     # set it to AttachedSingle: this is a no-op, but we test it because the
     # cloud control plane may call this for symmetry with live migration to
     # an individual pageserver
-    virtual_ps_http.tenant_location_conf(
+    r = virtual_ps_http.tenant_location_conf(
         tenant_id,
         {
             "mode": "AttachedSingle",
@@ -365,6 +368,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
             "generation": generation,
         },
     )
+    assert len(r["shards"]) == 1
 
     # We should see the tenant is now attached to the pageserver managed
     # by the sharding service
@@ -395,7 +399,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     # The generation has moved on since we onboarded
     assert generation != dest_tenant_before_conf_change["generation"]
 
-    virtual_ps_http.tenant_location_conf(
+    r = virtual_ps_http.tenant_location_conf(
         tenant_id,
         {
             "mode": "AttachedSingle",
@@ -405,6 +409,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
             "generation": generation,
         },
     )
+    assert len(r["shards"]) == 1
     dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
     assert (
         dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
@@ -415,7 +420,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
     env.storage_controller.consistency_check()
 
 
-def test_sharding_service_compute_hook(
+def test_storage_controller_compute_hook(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
     httpserver_listen_address,
@@ -528,7 +533,7 @@ def test_sharding_service_compute_hook(
     env.storage_controller.consistency_check()
 
 
-def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
+def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
     """
     Verify that occasional-use debug APIs work as expected.  This is a lightweight test
     that just hits the endpoints to check that they don't bitrot.
@@ -589,7 +594,7 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.consistency_check()
 
 
-def test_sharding_service_s3_time_travel_recovery(
+def test_storage_controller_s3_time_travel_recovery(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
 ):
@@ -699,7 +704,7 @@ def test_sharding_service_s3_time_travel_recovery(
     env.storage_controller.consistency_check()
 
 
-def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
+def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.auth_enabled = True
     env = neon_env_builder.init_start()
     svc = env.storage_controller
@@ -723,13 +728,18 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
         StorageControllerApiException,
         match="Forbidden: JWT authentication error",
     ):
-        svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))
+        svc.request(
+            "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA)
+        )
 
     # Token with correct scope
     svc.request(
         "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API)
     )
 
+    # Token with admin scope should also be permitted
+    svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN))
+
     # No token
     with pytest.raises(
         StorageControllerApiException,
@@ -763,7 +773,7 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
         )
 
 
-def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
+def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder):
     """
     Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
     supplying the whole LocationConf.
@@ -866,7 +876,7 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
         PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"),
     ],
 )
-def test_sharding_service_heartbeats(
+def test_storage_controller_heartbeats(
     neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure
 ):
     neon_env_builder.num_pageservers = 2
@@ -976,7 +986,7 @@ def test_sharding_service_heartbeats(
     wait_until(10, 1, storage_controller_consistent)
 
 
-def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
+def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
     """
     Exercise the behavior of the /re-attach endpoint on pageserver startup when
     pageservers have a mixture of attached and secondary locations
@@ -1131,3 +1141,99 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui
 
     # And indeed the tenant should be attached
     assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
+
+
+def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
+    """
+    The storage controller command line interface (storcon-cli) is an internal tool.  Most tests
+    just use the APIs directly: this test exercises some basics of the CLI as a regression test
+    that the client remains usable as the server evolves.
+    """
+    output_dir = neon_env_builder.test_output_dir
+    shard_count = 4
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api]
+
+    def storcon_cli(args):
+        """
+        CLI wrapper: returns stdout split into a list of non-empty strings
+        """
+        (output_path, stdout, status_code) = subprocess_capture(
+            output_dir,
+            [str(s) for s in base_args + args],
+            echo_stderr=True,
+            echo_stdout=True,
+            env={},
+            check=False,
+            capture_stdout=True,
+            timeout=10,
+        )
+        if status_code:
+            log.warning(f"Command {args} failed")
+            log.warning(f"Output at: {output_path}")
+
+            raise RuntimeError("CLI failure (check logs for stderr)")
+
+        assert stdout is not None
+        return [line.strip() for line in stdout.split("\n") if line.strip()]
+
+    # List nodes
+    node_lines = storcon_cli(["nodes"])
+    # Table header, footer, and one line of data
+    assert len(node_lines) == 5
+    assert "localhost" in node_lines[3]
+
+    # Pause scheduling onto a node
+    storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"])
+    assert "Pause" in storcon_cli(["nodes"])[3]
+
+    # We will simulate a node death and then marking it offline
+    env.pageservers[0].stop(immediate=True)
+    # Sleep to make it unlikely that the controller's heartbeater will race handling
+    # a /utilization response internally, such that it marks the node back online.  IRL
+    # there would always be a longer delay than this before a node failing and a human
+    # intervening.
+    time.sleep(2)
+
+    storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
+    assert "Offline" in storcon_cli(["nodes"])[3]
+
+    # List tenants
+    tenant_lines = storcon_cli(["tenants"])
+    assert len(tenant_lines) == 5
+    assert str(env.initial_tenant) in tenant_lines[3]
+
+    # Setting scheduling policies intentionally result in warnings, they're for rare use.
+    env.storage_controller.allowed_errors.extend(
+        [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"]
+    )
+
+    # Describe a tenant
+    tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)])
+    assert len(tenant_lines) == 3 + shard_count * 2
+    assert str(env.initial_tenant) in tenant_lines[3]
+
+    # Pause changes on a tenant
+    storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
+    assert "Stop" in storcon_cli(["tenants"])[3]
+
+    # Change a tenant's placement
+    storcon_cli(
+        ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
+    )
+    assert "Secondary" in storcon_cli(["tenants"])[3]
+
+    # Modify a tenant's config
+    storcon_cli(
+        [
+            "tenant-config",
+            "--tenant-id",
+            str(env.initial_tenant),
+            "--config",
+            json.dumps({"pitr_interval": "1m"}),
+        ]
+    )
+
+    # Quiesce any background reconciliation before doing consistency check
+    env.storage_controller.reconcile_until_idle(timeout_secs=10)
+    env.storage_controller.consistency_check()
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 025cc930d7..4c8fd4b0e5 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -20,9 +20,10 @@ from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TimelineId
 
 
-@pytest.mark.xfail
-def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
-    env = neon_simple_env
+def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_configs()
+    env.start()
+
     (tenant_id, _) = env.neon_cli.create_tenant()
     http_client = env.pageserver.http_client()
     initial_size = http_client.tenant_size(tenant_id)
@@ -35,66 +36,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
     assert branch_name == main_branch_name
 
-    with env.endpoints.create_start(
+    endpoint = env.endpoints.create_start(
         main_branch_name,
         tenant_id=tenant_id,
         config_lines=["autovacuum=off", "checkpoint_timeout=10min"],
-    ) as endpoint:
-        with endpoint.cursor() as cur:
-            cur.execute("SELECT 1")
-            row = cur.fetchone()
-            assert row is not None
-            assert row[0] == 1
-        size = http_client.tenant_size(tenant_id)
-        # we've disabled the autovacuum and checkpoint
-        # so background processes should not change the size.
-        # If this test will flake we should probably loosen the check
-        assert (
-            size == initial_size
-        ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})"
+    )
 
-    # the size should be the same, until we increase the size over the
-    # gc_horizon
-    size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
-    assert (
-        size == initial_size
-    ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})"
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT 1")
+        row = cur.fetchone()
+        assert row is not None
+        assert row[0] == 1
 
-    expected_inputs = {
-        "segments": [
-            {
-                "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
-                "timeline_id": f"{main_timeline_id}",
-                "kind": "BranchStart",
-            },
-            {
-                "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
-                "timeline_id": f"{main_timeline_id}",
-                "kind": "BranchEnd",
-            },
-        ],
-        "timeline_inputs": [
-            {
-                "timeline_id": f"{main_timeline_id}",
-                "ancestor_id": None,
-                "ancestor_lsn": "0/0",
-                "last_record": "0/1698CC0",
-                "latest_gc_cutoff": "0/1698C48",
-                "horizon_cutoff": "0/0",
-                "pitr_cutoff": "0/0",
-                "next_gc_cutoff": "0/0",
-                "retention_param_cutoff": None,
-            }
-        ],
-    }
-    expected_inputs = mask_model_inputs(expected_inputs)
-    actual_inputs = mask_model_inputs(inputs)
+    # The transaction above will make the compute generate a checkpoint.
+    # In turn, the pageserver persists the checkpoint. This should only be
+    # one key with a size of a couple hundred bytes.
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id)
+    size = http_client.tenant_size(tenant_id)
 
-    assert expected_inputs == actual_inputs
-
-    size_debug_file = open(test_output_dir / "size_debug.html", "w")
-    size_debug = http_client.tenant_size_debug(tenant_id)
-    size_debug_file.write(size_debug)
+    assert size >= initial_size and size - initial_size < 1024
 
 
 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path):
@@ -190,7 +150,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou
     size_debug_file.write(size_debug)
 
 
-@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
 def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = 15
@@ -233,7 +192,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir:
     size_debug_file.write(size_debug)
 
 
-@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
 def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = 5
@@ -282,7 +240,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     size_debug_file.write(size_debug)
 
 
-@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
 def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = small
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 2cac58dc1a..ac1a747df3 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -103,9 +103,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
 
     n_timelines = 3
 
-    branch_names = [
-        "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines)
-    ]
+    branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)]
     # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418')
     # that's not really human readable, so the branch names are introduced in Neon CLI.
     # Neon CLI stores its branch <-> timeline mapping in its internals,
@@ -1136,13 +1134,13 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
         for f in mismatch:
             f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
             f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
-            stdout_filename = "{}.filediff".format(f2)
+            stdout_filename = f"{f2}.filediff"
 
             with open(stdout_filename, "w") as stdout_f:
-                subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
-                subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+                subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True)
+                subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True)
 
-                cmd = "diff {}.hex {}.hex".format(f1, f2)
+                cmd = f"diff {f1}.hex {f2}.hex"
                 subprocess.run([cmd], stdout=stdout_f, shell=True)
 
             assert (mismatch, not_regular) == (
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index 720633189e..5902eb3217 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -10,6 +10,7 @@ import pytest
 import toml
 from fixtures.log_helper import getLogger
 from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
+from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn, TenantId, TimelineId
 
 log = getLogger("root.safekeeper_async")
@@ -76,20 +77,20 @@ class WorkerStats(object):
         self.counters[worker_id] += 1
 
     def check_progress(self):
-        log.debug("Workers progress: {}".format(self.counters))
+        log.debug(f"Workers progress: {self.counters}")
 
         # every worker should finish at least one tx
         assert all(cnt > 0 for cnt in self.counters)
 
         progress = sum(self.counters)
-        log.info("All workers made {} transactions".format(progress))
+        log.info(f"All workers made {progress} transactions")
 
 
 async def run_random_worker(
     stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer
 ):
     pg_conn = await endpoint.connect_async()
-    log.debug("Started worker {}".format(worker_id))
+    log.debug(f"Started worker {worker_id}")
 
     while stats.running:
         from_uid = random.randint(0, n_accounts - 1)
@@ -99,9 +100,9 @@ async def run_random_worker(
         await bank_transfer(pg_conn, from_uid, to_uid, amount)
         stats.inc_progress(worker_id)
 
-        log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid))
+        log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}")
 
-    log.debug("Finished worker {}".format(worker_id))
+    log.debug(f"Finished worker {worker_id}")
 
     await pg_conn.close()
 
@@ -199,7 +200,9 @@ async def run_restarts_under_load(
         # assert that at least one transaction has completed in every worker
         stats.check_progress()
 
-        victim.start()
+        # testing #6530, temporary here
+        # TODO: remove afer partial backup is enabled by default
+        victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"])
 
     log.info("Iterations are finished, exiting coroutines...")
     stats.running = False
@@ -213,6 +216,7 @@ async def run_restarts_under_load(
 # Restart acceptors one by one, while executing and validating bank transactions
 def test_restarts_under_load(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
     env.neon_cli.create_branch("test_safekeepers_restarts_under_load")