Remove assert for non-zero number of shards from AssignPageserverConnstring

Bump postgres version
Bump Postgres version
2026-05-13 03:00:37 +00:00 · 2023-12-20 09:16:54 +02:00 · 2023-12-19 22:17:07 +02:00 · 2023-12-19 16:21:15 +02:00 · 2023-12-19 14:45:25 +02:00 · 2023-12-19 10:20:56 +02:00
128 changed files with 5433 additions and 2590 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -881,7 +881,7 @@ dependencies = [
 "regex",
 "rustc-hash",
 "shlex",
- "syn 2.0.28",
+ "syn 2.0.32",
 "which",
 ]

@@ -1095,7 +1095,7 @@ dependencies = [
 "heck",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -1245,16 +1245,19 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "camino",
 "clap",
 "comfy-table",
 "compute_api",
+ "futures",
 "git-version",
 "hex",
 "hyper",
 "nix 0.26.2",
 "once_cell",
 "pageserver_api",
+ "pageserver_client",
 "postgres",
 "postgres_backend",
 "postgres_connection",
@@ -1268,6 +1271,8 @@ dependencies = [
 "tar",
 "thiserror",
 "tokio",
+ "tokio-postgres",
+ "tokio-util",
 "toml",
 "tracing",
 "url",
@@ -1481,7 +1486,7 @@ dependencies = [
 "proc-macro2",
 "quote",
 "strsim",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -1492,7 +1497,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
 "darling_core",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -1567,7 +1572,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -1661,7 +1666,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -1915,7 +1920,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -2901,7 +2906,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -3103,6 +3108,7 @@ dependencies = [
 "humantime-serde",
 "hyper",
 "itertools",
+ "md5",
 "metrics",
 "nix 0.26.2",
 "num-traits",
@@ -3139,6 +3145,7 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
+ "tokio-stream",
 "tokio-tar",
 "tokio-util",
 "toml_edit",
@@ -3161,6 +3168,7 @@ dependencies = [
 "enum-map",
 "hex",
 "postgres_ffi",
+ "rand 0.8.5",
 "serde",
 "serde_json",
 "serde_with",
@@ -3171,6 +3179,27 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_client"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bytes",
+ "futures",
+ "pageserver_api",
+ "postgres",
+ "reqwest",
+ "serde",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
@@ -3330,7 +3359,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -3537,7 +3566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
 "proc-macro2",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -4145,7 +4174,7 @@ dependencies = [
 "regex",
 "relative-path",
 "rustc_version",
- "syn 2.0.28",
+ "syn 2.0.32",
 "unicode-ident",
 ]

@@ -4291,6 +4320,7 @@ dependencies = [
 "histogram",
 "itertools",
 "pageserver",
+ "pageserver_api",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -4579,7 +4609,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -4660,7 +4690,7 @@ dependencies = [
 "darling",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -4927,9 +4957,9 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "2.0.28"
+version = "2.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
+checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -5059,7 +5089,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -5177,7 +5207,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -5478,7 +5508,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
@@ -5764,6 +5794,7 @@ dependencies = [
 "serde",
 "serde_assert",
 "serde_json",
+ "serde_path_to_error",
 "serde_with",
 "signal-hook",
 "strum",
@@ -5922,7 +5953,7 @@ dependencies = [
 "once_cell",
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 "wasm-bindgen-shared",
 ]

@@ -5956,7 +5987,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 "wasm-bindgen-backend",
 "wasm-bindgen-shared",
 ]
@@ -6293,7 +6324,7 @@ dependencies = [
 "smallvec",
 "subtle",
 "syn 1.0.109",
- "syn 2.0.28",
+ "syn 2.0.32",
 "time",
 "time-macros",
 "tokio",
@@ -6355,22 +6386,22 @@ dependencies = [

 [[package]]
 name = "zerocopy"
-version = "0.7.3"
+version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74"
+checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
 dependencies = [
 "zerocopy-derive",
 ]

 [[package]]
 name = "zerocopy-derive"
-version = "0.7.3"
+version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd"
+checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.28",
+ "syn 2.0.32",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
    "control_plane",
    "pageserver",
    "pageserver/ctl",
+    "pageserver/client",
    "proxy",
    "safekeeper",
    "storage_broker",
@@ -182,6 +183,7 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
+pageserver_client = { path = "./pageserver/client" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/README.md
+++ b/README.md
@@ -29,13 +29,14 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
+libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
+  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \
+  libffi-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,9 +6,11 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
 once_cell.workspace = true
@@ -24,10 +26,11 @@ tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 tokio.workspace = true
+tokio-postgres.workspace = true
+tokio-util.workspace = true
 url.workspace = true
-# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
-# instead, so that recompile times are better.
 pageserver_api.workspace = true
+pageserver_client.workspace = true
 postgres_backend.workspace = true
 safekeeper_api.workspace = true
 postgres_connection.workspace = true
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,7 +9,7 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: PathBuf,
-    client: reqwest::blocking::Client,
+    client: reqwest::Client,
 }

 const COMMAND: &str = "attachment_service";
@@ -53,7 +53,7 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
-            client: reqwest::blocking::ClientBuilder::new()
+            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
@@ -64,7 +64,7 @@ impl AttachmentService {
            .expect("non-Unicode path")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub async fn start(&self) -> anyhow::Result<Child> {
        let path_str = self.path.to_string_lossy();

        background_process::start_process(
@@ -73,10 +73,11 @@ impl AttachmentService {
            &self.env.attachment_service_bin(),
            ["-l", &self.listen, "-p", &path_str],
            [],
-            background_process::InitialPidFile::Create(&self.pid_file()),
+            background_process::InitialPidFile::Create(self.pid_file()),
            // TODO: a real status check
-            || Ok(true),
+            || async move { anyhow::Ok(true) },
        )
+        .await
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
@@ -84,7 +85,7 @@ impl AttachmentService {
    }

    /// Call into the attach_hook API, for use before handing out attachments to pageservers
-    pub fn attach_hook(
+    pub async fn attach_hook(
        &self,
        tenant_id: TenantId,
        pageserver_id: NodeId,
@@ -104,16 +105,16 @@ impl AttachmentService {
            node_id: Some(pageserver_id),
        };

-        let response = self.client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send().await?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }

-        let response = response.json::<AttachHookResponse>()?;
+        let response = response.json::<AttachHookResponse>().await?;
        Ok(response.gen)
    }

-    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+    pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
        use hyper::StatusCode;

        let url = self
@@ -126,12 +127,12 @@ impl AttachmentService {

        let request = InspectRequest { tenant_id };

-        let response = self.client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send().await?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }

-        let response = response.json::<InspectResponse>()?;
+        let response = response.json::<InspectResponse>().await?;
        Ok(response.attachment)
    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;

 /// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
 /// it itself.
-pub enum InitialPidFile<'t> {
+pub enum InitialPidFile {
    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
-    Create(&'t Utf8Path),
+    Create(Utf8PathBuf),
    /// The process will create the pidfile itself, need to wait for that event.
-    Expect(&'t Utf8Path),
+    Expect(Utf8PathBuf),
 }

 /// Start a background child process using the parameters given.
-pub fn start_process<F, AI, A, EI>(
+pub async fn start_process<F, Fut, AI, A, EI>(
    process_name: &str,
    datadir: &Path,
    command: &Path,
@@ -62,7 +62,8 @@ pub fn start_process<F, AI, A, EI>(
    process_status_check: F,
 ) -> anyhow::Result<Child>
 where
-    F: Fn() -> anyhow::Result<bool>,
+    F: Fn() -> Fut,
+    Fut: std::future::Future<Output = anyhow::Result<bool>>,
    AI: IntoIterator<Item = A>,
    A: AsRef<OsStr>,
    // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
@@ -89,7 +90,7 @@ where
    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

-    let pid_file_to_check = match initial_pid_file {
+    let pid_file_to_check = match &initial_pid_file {
        InitialPidFile::Create(path) => {
            pre_exec_create_pidfile(filled_cmd, path);
            path
@@ -107,7 +108,7 @@ where
    );

    for retries in 0..RETRIES {
-        match process_started(pid, Some(pid_file_to_check), &process_status_check) {
+        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
                println!("\n{process_name} started, pid: {pid}");
                return Ok(spawned_process);
@@ -316,22 +317,20 @@ where
    cmd
 }

-fn process_started<F>(
+async fn process_started<F, Fut>(
    pid: Pid,
-    pid_file_to_check: Option<&Utf8Path>,
+    pid_file_to_check: &Utf8Path,
    status_check: &F,
 ) -> anyhow::Result<bool>
 where
-    F: Fn() -> anyhow::Result<bool>,
+    F: Fn() -> Fut,
+    Fut: std::future::Future<Output = anyhow::Result<bool>>,
 {
-    match status_check() {
-        Ok(true) => match pid_file_to_check {
-            Some(pid_file_path) => match pid_file::read(pid_file_path)? {
-                PidFileRead::NotExist => Ok(false),
-                PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
-                PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
-            },
-            None => Ok(true),
+    match status_check().await {
+        Ok(true) => match pid_file::read(pid_file_to_check)? {
+            PidFileRead::NotExist => Ok(false),
+            PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
+            PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
        },
        Ok(false) => Ok(false),
        Err(e) => anyhow::bail!("process failed to start: {e}"),
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -120,15 +120,20 @@ fn main() -> Result<()> {
        let mut env = LocalEnv::load_config().context("Error loading config")?;
        let original_env = env.clone();

+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+
        let subcommand_result = match sub_name {
-            "tenant" => handle_tenant(sub_args, &mut env),
-            "timeline" => handle_timeline(sub_args, &mut env),
-            "start" => handle_start_all(sub_args, &env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
+            "start" => rt.block_on(handle_start_all(sub_args, &env)),
            "stop" => handle_stop_all(sub_args, &env),
-            "pageserver" => handle_pageserver(sub_args, &env),
-            "attachment_service" => handle_attachment_service(sub_args, &env),
-            "safekeeper" => handle_safekeeper(sub_args, &env),
-            "endpoint" => handle_endpoint(sub_args, &env),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
+            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
@@ -269,12 +274,13 @@ fn print_timeline(

 /// Returns a map of timeline IDs to timeline_id@lsn strings.
 /// Connects to the pageserver to query this information.
-fn get_timeline_infos(
+async fn get_timeline_infos(
    env: &local_env::LocalEnv,
    tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
    Ok(get_default_pageserver(env)
-        .timeline_list(tenant_id)?
+        .timeline_list(tenant_id)
+        .await?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
        .collect())
@@ -373,11 +379,14 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
        .collect()
 }

-fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_tenant(
+    tenant_match: &ArgMatches,
+    env: &mut local_env::LocalEnv,
+) -> anyhow::Result<()> {
    let pageserver = get_default_pageserver(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
-            for t in pageserver.tenant_list()? {
+            for t in pageserver.tenant_list().await? {
                println!("{} {:?}", t.id, t.state);
            }
        }
@@ -394,12 +403,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                // We must register the tenant with the attachment service, so
                // that when the pageserver restarts, it will be re-attached.
                let attachment_service = AttachmentService::from_env(env);
-                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
+                attachment_service
+                    .attach_hook(tenant_id, pageserver.conf.id)
+                    .await?
            } else {
                None
            };

-            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
+            pageserver
+                .tenant_create(tenant_id, generation, tenant_conf)
+                .await?;
            println!("tenant {tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
@@ -409,14 +422,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                new_timeline_id,
-                None,
-                None,
-                Some(pg_version),
-                None,
-            )?;
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    new_timeline_id,
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )
+                .await?;
            let new_timeline_id = timeline_info.timeline_id;
            let last_record_lsn = timeline_info.last_record_lsn;

@@ -450,6 +465,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an

            pageserver
                .tenant_config(tenant_id, tenant_conf)
+                .await
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
@@ -458,7 +474,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
            let new_pageserver = get_pageserver(env, matches)?;
            let new_pageserver_id = new_pageserver.conf.id;

-            migrate_tenant(env, tenant_id, new_pageserver)?;
+            migrate_tenant(env, tenant_id, new_pageserver).await?;
            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
        }

@@ -468,13 +484,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
    Ok(())
 }

-fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
    let pageserver = get_default_pageserver(env);

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
            let tenant_id = get_tenant_id(list_match, env)?;
-            let timelines = pageserver.timeline_list(&tenant_id)?;
+            let timelines = pageserver.timeline_list(&tenant_id).await?;
            print_timelines_tree(timelines, env.timeline_name_mappings())?;
        }
        Some(("create", create_match)) => {
@@ -490,14 +506,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -

            let new_timeline_id_opt = parse_timeline_id(create_match)?;

-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                new_timeline_id_opt,
-                None,
-                None,
-                Some(pg_version),
-                None,
-            )?;
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    new_timeline_id_opt,
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )
+                .await?;
            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
@@ -542,7 +560,9 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -

            let mut cplane = ComputeControlPlane::load(env.clone())?;
            println!("Importing timeline into pageserver ...");
-            pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
+            pageserver
+                .timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
+                .await?;
            env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;

            println!("Creating endpoint for imported timeline ...");
@@ -578,14 +598,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .map(|lsn_str| Lsn::from_str(lsn_str))
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                None,
-                start_lsn,
-                Some(ancestor_timeline_id),
-                None,
-                None,
-            )?;
+            let timeline_info = pageserver
+                .timeline_create(
+                    tenant_id,
+                    None,
+                    start_lsn,
+                    Some(ancestor_timeline_id),
+                    None,
+                    None,
+                )
+                .await?;
            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
@@ -604,7 +626,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
    Ok(())
 }

-fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match ep_match.subcommand() {
        Some(ep_subcommand_data) => ep_subcommand_data,
        None => bail!("no endpoint subcommand provided"),
@@ -614,10 +636,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    match sub_name {
        "list" => {
            let tenant_id = get_tenant_id(sub_args, env)?;
-            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
-                eprintln!("Failed to load timeline info: {}", e);
-                HashMap::new()
-            });
+            let timeline_infos = get_timeline_infos(env, &tenant_id)
+                .await
+                .unwrap_or_else(|e| {
+                    eprintln!("Failed to load timeline info: {}", e);
+                    HashMap::new()
+                });

            let timeline_name_mappings = env.timeline_name_mappings();

@@ -791,7 +815,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            };

            println!("Starting existing endpoint {endpoint_id}...");
-            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
+            endpoint
+                .start(&auth_token, safekeepers, remote_ext_config)
+                .await?;
        }
        "reconfigure" => {
            let endpoint_id = sub_args
@@ -809,7 +835,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                } else {
                    None
                };
-            endpoint.reconfigure(pageserver_id)?;
+            endpoint.reconfigure(pageserver_id).await?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -875,11 +901,12 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
    ))
 }

-fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
            if let Err(e) = get_pageserver(env, subcommand_args)?
                .start(&pageserver_config_overrides(subcommand_args))
+                .await
            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
@@ -906,7 +933,10 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -920,14 +950,17 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

        Some(("status", subcommand_args)) => {
-            match get_pageserver(env, subcommand_args)?.check_status() {
+            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
                Err(err) => {
                    eprintln!("Page server is not available: {}", err);
@@ -942,11 +975,14 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
    Ok(())
 }

-fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_attachment_service(
+    sub_match: &ArgMatches,
+    env: &local_env::LocalEnv,
+) -> Result<()> {
    let svc = AttachmentService::from_env(env);
    match sub_match.subcommand() {
        Some(("start", _start_match)) => {
-            if let Err(e) = svc.start() {
+            if let Err(e) = svc.start().await {
                eprintln!("start failed: {e}");
                exit(1);
            }
@@ -987,7 +1023,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
        .collect()
 }

-fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
        None => bail!("no safekeeper subcommand provided"),
@@ -1005,7 +1041,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
        "start" => {
            let extra_opts = safekeeper_extra_opts(sub_args);

-            if let Err(e) = safekeeper.start(extra_opts) {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1031,7 +1067,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
            }

            let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper.start(extra_opts) {
+            if let Err(e) = safekeeper.start(extra_opts).await {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -1044,15 +1080,15 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
    Ok(())
 }

-fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
    // Endpoints are not started automatically

-    broker::start_broker_process(env)?;
+    broker::start_broker_process(env).await?;

    // Only start the attachment service if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.start() {
+        if let Err(e) = attachment_service.start().await {
            eprintln!("attachment_service start failed: {:#}", e);
            try_stop_all(env, true);
            exit(1);
@@ -1061,7 +1097,10 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
+        if let Err(e) = pageserver
+            .start(&pageserver_config_overrides(sub_match))
+            .await
+        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true);
            exit(1);
@@ -1070,7 +1109,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![]) {
+        if let Err(e) = safekeeper.start(vec![]).await {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

-pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let broker = &env.broker;
    let listen_addr = &broker.listen_addr;

@@ -19,15 +19,15 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {

    let args = [format!("--listen-addr={listen_addr}")];

-    let client = reqwest::blocking::Client::new();
+    let client = reqwest::Client::new();
    background_process::start_process(
        "storage_broker",
        &env.base_data_dir,
        &env.storage_broker_bin(),
        args,
        [],
-        background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
-        || {
+        background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
+        || async {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
                format!("Failed to append /status path to broker endpoint {url}")
@@ -36,12 +36,13 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
                .get(status_url)
                .build()
                .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
-            match client.execute(request) {
+            match client.execute(request).await {
                Ok(resp) => Ok(resp.status().is_success()),
                Err(_) => Ok(false),
            }
        },
    )
+    .await
    .context("Failed to spawn storage_broker subprocess")?;
    Ok(())
 }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -464,7 +464,7 @@ impl Endpoint {
        }
    }

-    pub fn start(
+    pub async fn start(
        &self,
        auth_token: &Option<String>,
        safekeepers: Vec<NodeId>,
@@ -587,7 +587,7 @@ impl Endpoint {
        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
        loop {
            attempt += 1;
-            match self.get_status() {
+            match self.get_status().await {
                Ok(state) => {
                    match state.status {
                        ComputeStatus::Init => {
@@ -629,8 +629,8 @@ impl Endpoint {
    }

    // Call the /status HTTP API
-    pub fn get_status(&self) -> Result<ComputeState> {
-        let client = reqwest::blocking::Client::new();
+    pub async fn get_status(&self) -> Result<ComputeState> {
+        let client = reqwest::Client::new();

        let response = client
            .request(
@@ -641,16 +641,17 @@ impl Endpoint {
                    self.http_address.port()
                ),
            )
-            .send()?;
+            .send()
+            .await?;

        // Interpret the response
        let status = response.status();
        if !(status.is_client_error() || status.is_server_error()) {
-            Ok(response.json()?)
+            Ok(response.json().await?)
        } else {
            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
            let url = response.url().to_owned();
-            let msg = match response.text() {
+            let msg = match response.text().await {
                Ok(err_body) => format!("Error: {}", err_body),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
@@ -658,7 +659,7 @@ impl Endpoint {
        }
    }

-    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+    pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
        let mut spec: ComputeSpec = {
            let spec_path = self.endpoint_path().join("spec.json");
            let file = std::fs::File::open(spec_path)?;
@@ -687,7 +688,7 @@ impl Endpoint {
            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
        }

-        let client = reqwest::blocking::Client::new();
+        let client = reqwest::Client::new();
        let response = client
            .post(format!(
                "http://{}:{}/configure",
@@ -698,14 +699,15 @@ impl Endpoint {
                "{{\"spec\":{}}}",
                serde_json::to_string_pretty(&spec)?
            ))
-            .send()?;
+            .send()
+            .await?;

        let status = response.status();
        if !(status.is_client_error() || status.is_server_error()) {
            Ok(())
        } else {
            let url = response.url().to_owned();
-            let msg = match response.text() {
+            let msg = match response.text().await {
                Ok(err_body) => format!("Error: {}", err_body),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -6,28 +6,24 @@
 //!
 use std::borrow::Cow;
 use std::collections::HashMap;
-use std::fs::File;
-use std::io::{BufReader, Write};
+
+use std::io;
+use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
 use std::time::Duration;
-use std::{io, result};

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{
-    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
-};
+use futures::SinkExt;
+use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
+use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use reqwest::blocking::{Client, RequestBuilder, Response};
-use reqwest::{IntoUrl, Method};
-use thiserror::Error;
 use utils::auth::{Claims, Scope};
 use utils::{
-    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -38,45 +34,6 @@ use crate::{background_process, local_env::LocalEnv};
 /// Directory within .neon which will be used by default for LocalFs remote storage.
 pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";

-#[derive(Error, Debug)]
-pub enum PageserverHttpError {
-    #[error("Reqwest error: {0}")]
-    Transport(#[from] reqwest::Error),
-
-    #[error("Error: {0}")]
-    Response(String),
-}
-
-impl From<anyhow::Error> for PageserverHttpError {
-    fn from(e: anyhow::Error) -> Self {
-        Self::Response(e.to_string())
-    }
-}
-
-type Result<T> = result::Result<T, PageserverHttpError>;
-
-pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> Result<Self>;
-}
-
-impl ResponseErrorMessageExt for Response {
-    fn error_from_body(self) -> Result<Self> {
-        let status = self.status();
-        if !(status.is_client_error() || status.is_server_error()) {
-            return Ok(self);
-        }
-
-        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
-        let url = self.url().to_owned();
-        Err(PageserverHttpError::Response(
-            match self.json::<HttpErrorBody>() {
-                Ok(err_body) => format!("Error: {}", err_body.msg),
-                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
-            },
-        ))
-    }
-}
-
 //
 // Control routines for pageserver.
 //
@@ -87,8 +44,7 @@ pub struct PageServerNode {
    pub pg_connection_config: PgConnectionConfig,
    pub conf: PageServerConf,
    pub env: LocalEnv,
-    pub http_client: Client,
-    pub http_base_url: String,
+    pub http_client: mgmt_api::Client,
 }

 impl PageServerNode {
@@ -100,8 +56,19 @@ impl PageServerNode {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            conf: conf.clone(),
            env: env.clone(),
-            http_client: Client::new(),
-            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
+            http_client: mgmt_api::Client::new(
+                format!("http://{}", conf.listen_http_addr),
+                {
+                    match conf.http_auth_type {
+                        AuthType::Trust => None,
+                        AuthType::NeonJWT => Some(
+                            env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                                .unwrap(),
+                        ),
+                    }
+                }
+                .as_deref(),
+            ),
        }
    }

@@ -182,8 +149,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
-        self.start_node(config_overrides, false)
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
+        self.start_node(config_overrides, false).await
    }

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -224,7 +191,12 @@ impl PageServerNode {
        Ok(())
    }

-    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
+    async fn start_node(
+        &self,
+        config_overrides: &[&str],
+        update_config: bool,
+    ) -> anyhow::Result<Child> {
+        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
@@ -232,7 +204,7 @@ impl PageServerNode {
            self.pg_connection_config.raw_address(),
            datadir
        );
-        io::stdout().flush()?;
+        io::stdout().flush().context("flush stdout")?;

        let datadir_path_str = datadir.to_str().with_context(|| {
            format!(
@@ -244,20 +216,23 @@ impl PageServerNode {
        if update_config {
            args.push(Cow::Borrowed("--update-config"));
        }
-
        background_process::start_process(
            "pageserver",
            &datadir,
            &self.env.pageserver_bin(),
            args.iter().map(Cow::as_ref),
            self.pageserver_env_variables()?,
-            background_process::InitialPidFile::Expect(&self.pid_file()),
-            || match self.check_status() {
-                Ok(()) => Ok(true),
-                Err(PageserverHttpError::Transport(_)) => Ok(false),
-                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            background_process::InitialPidFile::Expect(self.pid_file()),
+            || async {
+                let st = self.check_status().await;
+                match st {
+                    Ok(()) => Ok(true),
+                    Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
+                    Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+                }
            },
        )
+        .await
    }

    fn pageserver_basic_args<'a>(
@@ -303,7 +278,12 @@ impl PageServerNode {
        background_process::stop_process(immediate, "pageserver", &self.pid_file())
    }

-    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
+    pub async fn page_server_psql_client(
+        &self,
+    ) -> anyhow::Result<(
+        tokio_postgres::Client,
+        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
+    )> {
        let mut config = self.pg_connection_config.clone();
        if self.conf.pg_auth_type == AuthType::NeonJWT {
            let token = self
@@ -311,36 +291,18 @@ impl PageServerNode {
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
            config = config.set_password(Some(token));
        }
-        Ok(config.connect_no_tls()?)
+        Ok(config.connect_no_tls().await?)
    }

-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
-        let mut builder = self.http_client.request(method, url);
-        if self.conf.http_auth_type == AuthType::NeonJWT {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-            builder = builder.bearer_auth(token)
-        }
-        Ok(builder)
+    pub async fn check_status(&self) -> mgmt_api::Result<()> {
+        self.http_client.status().await
    }

-    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
-            .send()?
-            .error_from_body()?;
-        Ok(())
+    pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
+        self.http_client.list_tenants().await
    }

-    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
-        Ok(self
-            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
-            .send()?
-            .error_from_body()?
-            .json()?)
-    }
-
-    pub fn tenant_create(
+    pub async fn tenant_create(
        &self,
        new_tenant_id: TenantId,
        generation: Option<u32>,
@@ -407,6 +369,7 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_feedback' as bool")?,
+            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
        };

        let request = models::TenantCreateRequest {
@@ -417,23 +380,10 @@ impl PageServerNode {
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
        }
-        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
-            .json(&request)
-            .send()?
-            .error_from_body()?
-            .json::<Option<String>>()
-            .with_context(|| {
-                format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
-            })?
-            .context("No tenant id was found in the tenant creation response")
-            .and_then(|tenant_id_string| {
-                tenant_id_string.parse().with_context(|| {
-                    format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
-                })
-            })
+        Ok(self.http_client.tenant_create(&request).await?)
    }

-    pub fn tenant_config(
+    pub async fn tenant_config(
        &self,
        tenant_id: TenantId,
        mut settings: HashMap<&str, &str>,
@@ -504,6 +454,7 @@ impl PageServerNode {
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'gc_feedback' as bool")?,
+                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
            }
        };

@@ -511,54 +462,30 @@ impl PageServerNode {
            bail!("Unrecognized tenant settings: {settings:?}")
        }

-        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
-            .json(&models::TenantConfigRequest { tenant_id, config })
-            .send()?
-            .error_from_body()?;
+        self.http_client
+            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
+            .await?;

        Ok(())
    }

-    pub fn location_config(
+    pub async fn location_config(
        &self,
        tenant_id: TenantId,
        config: LocationConfig,
        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
-        let req_body = TenantLocationConfigRequest { tenant_id, config };
-
-        let path = format!(
-            "{}/tenant/{}/location_config",
-            self.http_base_url, tenant_id
-        );
-        let path = if let Some(flush_ms) = flush_ms {
-            format!("{}?flush_ms={}", path, flush_ms.as_millis())
-        } else {
-            path
-        };
-
-        self.http_request(Method::PUT, path)?
-            .json(&req_body)
-            .send()?
-            .error_from_body()?;
-
-        Ok(())
+        Ok(self
+            .http_client
+            .location_config(tenant_id, config, flush_ms)
+            .await?)
    }

-    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
-        let timeline_infos: Vec<TimelineInfo> = self
-            .http_request(
-                Method::GET,
-                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-            )?
-            .send()?
-            .error_from_body()?
-            .json()?;
-
-        Ok(timeline_infos)
+    pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
+        Ok(self.http_client.list_timelines(*tenant_id).await?)
    }

-    pub fn timeline_create(
+    pub async fn timeline_create(
        &self,
        tenant_id: TenantId,
        new_timeline_id: Option<TimelineId>,
@@ -569,29 +496,14 @@ impl PageServerNode {
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
-
-        self.http_request(
-            Method::POST,
-            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-        )?
-        .json(&models::TimelineCreateRequest {
+        let req = models::TimelineCreateRequest {
            new_timeline_id,
            ancestor_start_lsn,
            ancestor_timeline_id,
            pg_version,
            existing_initdb_timeline_id,
-        })
-        .send()?
-        .error_from_body()?
-        .json::<Option<TimelineInfo>>()
-        .with_context(|| {
-            format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
-        })?
-        .with_context(|| {
-            format!(
-                "No timeline id was found in the timeline creation response for tenant {tenant_id}"
-            )
-        })
+        };
+        Ok(self.http_client.timeline_create(tenant_id, &req).await?)
    }

    /// Import a basebackup prepared using either:
@@ -603,7 +515,7 @@ impl PageServerNode {
    /// * `timeline_id` - id to assign to imported timeline
    /// * `base` - (start lsn of basebackup, path to `base.tar` file)
    /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
-    pub fn timeline_import(
+    pub async fn timeline_import(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -611,36 +523,60 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
-        let mut client = self.page_server_psql_client()?;
+        let (client, conn) = self.page_server_psql_client().await?;
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own.
+        tokio::spawn(async move {
+            if let Err(e) = conn.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        tokio::pin!(client);

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
-        let base_tarfile = File::open(base_tarfile_path)?;
-        let mut base_reader = BufReader::new(base_tarfile);
+        let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
+        let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);

        // Init wal reader if necessary
        let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
-            let wal_tarfile = File::open(wal_tarfile_path)?;
-            let wal_reader = BufReader::new(wal_tarfile);
+            let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
+            let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
            (end_lsn, Some(wal_reader))
        } else {
            (start_lsn, None)
        };

-        // Import base
-        let import_cmd = format!(
-            "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
-        );
-        let mut writer = client.copy_in(&import_cmd)?;
-        io::copy(&mut base_reader, &mut writer)?;
-        writer.finish()?;
+        let copy_in = |reader, cmd| {
+            let client = &client;
+            async move {
+                let writer = client.copy_in(&cmd).await?;
+                let writer = std::pin::pin!(writer);
+                let mut writer = writer.sink_map_err(|e| {
+                    std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
+                });
+                let mut reader = std::pin::pin!(reader);
+                writer.send_all(&mut reader).await?;
+                writer.into_inner().finish().await?;
+                anyhow::Ok(())
+            }
+        };

+        // Import base
+        copy_in(
+            base_tarfile,
+            format!(
+                "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
+            ),
+        )
+        .await?;
        // Import wal if necessary
-        if let Some(mut wal_reader) = wal_reader {
-            let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
-            let mut writer = client.copy_in(&import_cmd)?;
-            io::copy(&mut wal_reader, &mut writer)?;
-            writer.finish()?;
+        if let Some(wal_reader) = wal_reader {
+            copy_in(
+                wal_reader,
+                format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
+            )
+            .await?;
        }

        Ok(())
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -13,7 +13,6 @@ use std::{io, result};
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
-use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{http::error::HttpErrorBody, id::NodeId};
@@ -34,12 +33,14 @@ pub enum SafekeeperHttpError {

 type Result<T> = result::Result<T, SafekeeperHttpError>;

+#[async_trait::async_trait]
 pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> Result<Self>;
+    async fn error_from_body(self) -> Result<Self>;
 }

-impl ResponseErrorMessageExt for Response {
-    fn error_from_body(self) -> Result<Self> {
+#[async_trait::async_trait]
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(self) -> Result<Self> {
        let status = self.status();
        if !(status.is_client_error() || status.is_server_error()) {
            return Ok(self);
@@ -48,7 +49,7 @@ impl ResponseErrorMessageExt for Response {
        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
        let url = self.url().to_owned();
        Err(SafekeeperHttpError::Response(
-            match self.json::<HttpErrorBody>() {
+            match self.json::<HttpErrorBody>().await {
                Ok(err_body) => format!("Error: {}", err_body.msg),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            },
@@ -69,7 +70,7 @@ pub struct SafekeeperNode {

    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
-    pub http_client: Client,
+    pub http_client: reqwest::Client,
    pub http_base_url: String,
 }

@@ -80,7 +81,7 @@ impl SafekeeperNode {
            conf: conf.clone(),
            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
            env: env.clone(),
-            http_client: Client::new(),
+            http_client: reqwest::Client::new(),
            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
        }
    }
@@ -103,7 +104,7 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -191,13 +192,16 @@ impl SafekeeperNode {
            &self.env.safekeeper_bin(),
            &args,
            [],
-            background_process::InitialPidFile::Expect(&self.pid_file()),
-            || match self.check_status() {
-                Ok(()) => Ok(true),
-                Err(SafekeeperHttpError::Transport(_)) => Ok(false),
-                Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+            background_process::InitialPidFile::Expect(self.pid_file()),
+            || async {
+                match self.check_status().await {
+                    Ok(()) => Ok(true),
+                    Err(SafekeeperHttpError::Transport(_)) => Ok(false),
+                    Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
+                }
            },
        )
+        .await
    }

    ///
@@ -216,7 +220,7 @@ impl SafekeeperNode {
        )
    }

-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
        // TODO: authentication
        //if self.env.auth_type == AuthType::NeonJWT {
        //    builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
@@ -224,10 +228,12 @@ impl SafekeeperNode {
        self.http_client.request(method, url)
    }

-    pub fn check_status(&self) -> Result<()> {
+    pub async fn check_status(&self) -> Result<()> {
        self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
-            .send()?
-            .error_from_body()?;
+            .send()
+            .await?
+            .error_from_body()
+            .await?;
        Ok(())
    }
 }
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -19,11 +19,11 @@ use utils::{
 };

 /// Given an attached pageserver, retrieve the LSN for all timelines
-fn get_lsns(
+async fn get_lsns(
    tenant_id: TenantId,
    pageserver: &PageServerNode,
 ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-    let timelines = pageserver.timeline_list(&tenant_id)?;
+    let timelines = pageserver.timeline_list(&tenant_id).await?;
    Ok(timelines
        .into_iter()
        .map(|t| (t.timeline_id, t.last_record_lsn))
@@ -32,13 +32,13 @@ fn get_lsns(

 /// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
 /// `baseline`.
-fn await_lsn(
+async fn await_lsn(
    tenant_id: TenantId,
    pageserver: &PageServerNode,
    baseline: HashMap<TimelineId, Lsn>,
 ) -> anyhow::Result<()> {
    loop {
-        let latest = match get_lsns(tenant_id, pageserver) {
+        let latest = match get_lsns(tenant_id, pageserver).await {
            Ok(l) => l,
            Err(e) => {
                println!(
@@ -84,7 +84,7 @@ fn await_lsn(
 ///  - Coordinate attach/secondary/detach on pageservers
 ///  - call into attachment_service for generations
 ///  - reconfigure compute endpoints to point to new attached pageserver
-pub fn migrate_tenant(
+pub async fn migrate_tenant(
    env: &LocalEnv,
    tenant_id: TenantId,
    dest_ps: PageServerNode,
@@ -108,16 +108,18 @@ pub fn migrate_tenant(
        }
    }

-    let previous = attachment_service.inspect(tenant_id)?;
+    let previous = attachment_service.inspect(tenant_id).await?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);

        if origin_ps_id == &dest_ps.conf.id {
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
-            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let gen = attachment_service
+                .attach_hook(tenant_id, dest_ps.conf.id)
+                .await?;
            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-            dest_ps.location_config(tenant_id, dest_conf, None)?;
+            dest_ps.location_config(tenant_id, dest_conf, None).await?;
            println!("✅ Migration complete");
            return Ok(());
        }
@@ -126,20 +128,24 @@ pub fn migrate_tenant(

        let stale_conf =
            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
-        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
+        origin_ps
+            .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
+            .await?;

-        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
    }

-    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let gen = attachment_service
+        .attach_hook(tenant_id, dest_ps.conf.id)
+        .await?;
    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);

    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_id, dest_conf, None).await?;

    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
-        await_lsn(tenant_id, &dest_ps, baseline)?;
+        await_lsn(tenant_id, &dest_ps, baseline).await?;
    }

    let cplane = ComputeControlPlane::load(env.clone())?;
@@ -149,7 +155,7 @@ pub fn migrate_tenant(
                "🔁 Reconfiguring endpoint {} to use pageserver {}",
                endpoint_name, dest_ps.conf.id
            );
-            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+            endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
        }
    }

@@ -159,7 +165,7 @@ pub fn migrate_tenant(
        }

        let other_ps = PageServerNode::from_env(env, other_ps_conf);
-        let other_ps_tenants = other_ps.tenant_list()?;
+        let other_ps_tenants = other_ps.tenant_list().await?;

        // Check if this tenant is attached
        let found = other_ps_tenants
@@ -181,7 +187,9 @@ pub fn migrate_tenant(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
-        other_ps.location_config(tenant_id, secondary_conf, None)?;
+        other_ps
+            .location_config(tenant_id, secondary_conf, None)
+            .await?;
    }

    println!(
@@ -189,7 +197,7 @@ pub fn migrate_tenant(
        dest_ps.conf.id
    );
    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-    dest_ps.location_config(tenant_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_id, dest_conf, None).await?;

    println!("✅ Migration complete");

--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -3,8 +3,11 @@
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]
+
 use once_cell::sync::Lazy;
-use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
+use prometheus::core::{
+    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
+};
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
@@ -132,3 +135,137 @@ fn get_rusage_stats() -> libc::rusage {
        rusage.assume_init()
    }
 }
+
+/// Create an [`IntCounterPairVec`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_int_counter_pair_vec {
+    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
+        match (
+            $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
+            $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
+        ) {
+            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
+            (Err(e), _) | (_, Err(e)) => Err(e),
+        }
+    }};
+}
+/// Create an [`IntCounterPair`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_int_counter_pair {
+    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
+        match (
+            $crate::register_int_counter!($NAME1, $HELP1),
+            $crate::register_int_counter!($NAME2, $HELP2),
+        ) {
+            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
+            (Err(e), _) | (_, Err(e)) => Err(e),
+        }
+    }};
+}
+
+/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes
+pub struct GenericCounterPairVec<P: Atomic> {
+    inc: GenericCounterVec<P>,
+    dec: GenericCounterVec<P>,
+}
+
+/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes
+pub struct GenericCounterPair<P: Atomic> {
+    inc: GenericCounter<P>,
+    dec: GenericCounter<P>,
+}
+
+impl<P: Atomic> GenericCounterPairVec<P> {
+    pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
+        Self { inc, dec }
+    }
+
+    /// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
+        Ok(GenericCounterPair {
+            inc: self.inc.get_metric_with_label_values(vals)?,
+            dec: self.dec.get_metric_with_label_values(vals)?,
+        })
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
+}
+
+impl<P: Atomic> GenericCounterPair<P> {
+    pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
+        Self { inc, dec }
+    }
+
+    /// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
+    pub fn guard(&self) -> GenericCounterPairGuard<P> {
+        self.inc.inc();
+        GenericCounterPairGuard(self.dec.clone())
+    }
+
+    /// Increment the gauge by n, returning a guard that decrements by n on drop.
+    pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
+        self.inc.inc_by(n);
+        GenericCounterPairGuardBy(self.dec.clone(), n)
+    }
+
+    /// Increase the gauge by 1.
+    #[inline]
+    pub fn inc(&self) {
+        self.inc.inc();
+    }
+
+    /// Decrease the gauge by 1.
+    #[inline]
+    pub fn dec(&self) {
+        self.dec.inc();
+    }
+
+    /// Add the given value to the gauge. (The value can be
+    /// negative, resulting in a decrement of the gauge.)
+    #[inline]
+    pub fn inc_by(&self, v: P::T) {
+        self.inc.inc_by(v);
+    }
+
+    /// Subtract the given value from the gauge. (The value can be
+    /// negative, resulting in an increment of the gauge.)
+    #[inline]
+    pub fn dec_by(&self, v: P::T) {
+        self.dec.inc_by(v);
+    }
+}
+
+/// Guard returned by [`GenericCounterPair::guard`]
+pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
+
+impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
+    fn drop(&mut self) {
+        self.0.inc();
+    }
+}
+/// Guard returned by [`GenericCounterPair::guard_by`]
+pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
+
+impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
+    fn drop(&mut self) {
+        self.0.inc_by(self.1);
+    }
+}
+
+/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
+pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
+
+/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
+pub type IntCounterPair = GenericCounterPair<AtomicU64>;
+
+/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
+pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -24,3 +24,4 @@ workspace_hack.workspace = true

 [dev-dependencies]
 bincode.workspace = true
+rand.workspace = true
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -144,3 +144,37 @@ impl Key {
 pub fn is_rel_block_key(key: &Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0
 }
+
+impl std::str::FromStr for Key {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        Self::from_hex(s)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use crate::key::Key;
+
+    use rand::Rng;
+    use rand::SeedableRng;
+
+    #[test]
+    fn display_fromstr_bijection() {
+        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
+
+        let key = Key {
+            field1: rng.gen(),
+            field2: rng.gen(),
+            field3: rng.gen(),
+            field4: rng.gen(),
+            field5: rng.gen(),
+            field6: rng.gen(),
+        };
+
+        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
+    }
+}
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,11 +1,12 @@
-use crate::repository::{key_range_size, singleton_range, Key};
 use postgres_ffi::BLCKSZ;
 use std::ops::Range;

+use crate::key::Key;
+
 ///
 /// Represents a set of Keys, in a compact form.
 ///
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct KeySpace {
    /// Contiguous ranges of keys that belong to the key space. In key order,
    /// and with no overlap.
@@ -186,6 +187,33 @@ impl KeySpaceRandomAccum {
    }
 }

+pub fn key_range_size(key_range: &Range<Key>) -> u32 {
+    let start = key_range.start;
+    let end = key_range.end;
+
+    if end.field1 != start.field1
+        || end.field2 != start.field2
+        || end.field3 != start.field3
+        || end.field4 != start.field4
+    {
+        return u32::MAX;
+    }
+
+    let start = (start.field5 as u64) << 32 | start.field6 as u64;
+    let end = (end.field5 as u64) << 32 | end.field6 as u64;
+
+    let diff = end - start;
+    if diff > u32::MAX as u64 {
+        u32::MAX
+    } else {
+        diff as u32
+    }
+}
+
+pub fn singleton_range(key: Key) -> Range<Key> {
+    key..key.next()
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -5,6 +5,7 @@ use const_format::formatcp;
 /// Public API types
 pub mod control_api;
 pub mod key;
+pub mod keyspace;
 pub mod models;
 pub mod reltag;
 pub mod shard;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,5 +1,8 @@
+pub mod partitioning;
+
 use std::{
    collections::HashMap,
+    io::Read,
    num::{NonZeroU64, NonZeroUsize},
    time::SystemTime,
 };
@@ -17,7 +20,7 @@ use utils::{

 use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
-use bytes::{BufMut, Bytes, BytesMut};
+use bytes::{Buf, BufMut, Bytes, BytesMut};

 /// The state of a tenant in this pageserver.
 ///
@@ -237,6 +240,7 @@ pub struct TenantConfig {
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub gc_feedback: Option<bool>,
+    pub heatmap_period: Option<String>,
 }

 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
@@ -573,6 +577,7 @@ pub enum PagestreamFeMessage {
 }

 // Wrapped in libpq CopyData
+#[derive(strum_macros::EnumProperty)]
 pub enum PagestreamBeMessage {
    Exists(PagestreamExistsResponse),
    Nblocks(PagestreamNblocksResponse),
@@ -581,6 +586,29 @@ pub enum PagestreamBeMessage {
    DbSize(PagestreamDbSizeResponse),
 }

+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamBeMessageTag {
+    Exists = 100,
+    Nblocks = 101,
+    GetPage = 102,
+    Error = 103,
+    DbSize = 104,
+}
+impl TryFrom<u8> for PagestreamBeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            100 => Ok(PagestreamBeMessageTag::Exists),
+            101 => Ok(PagestreamBeMessageTag::Nblocks),
+            102 => Ok(PagestreamBeMessageTag::GetPage),
+            103 => Ok(PagestreamBeMessageTag::Error),
+            104 => Ok(PagestreamBeMessageTag::DbSize),
+            _ => Err(value),
+        }
+    }
+}
+
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
    pub latest: bool,
@@ -736,35 +764,91 @@ impl PagestreamBeMessage {
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();

+        use PagestreamBeMessageTag as Tag;
        match self {
            Self::Exists(resp) => {
-                bytes.put_u8(100); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::Exists as u8);
                bytes.put_u8(resp.exists as u8);
            }

            Self::Nblocks(resp) => {
-                bytes.put_u8(101); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::Nblocks as u8);
                bytes.put_u32(resp.n_blocks);
            }

            Self::GetPage(resp) => {
-                bytes.put_u8(102); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::GetPage as u8);
                bytes.put(&resp.page[..]);
            }

            Self::Error(resp) => {
-                bytes.put_u8(103); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::Error as u8);
                bytes.put(resp.message.as_bytes());
                bytes.put_u8(0); // null terminator
            }
            Self::DbSize(resp) => {
-                bytes.put_u8(104); /* tag from pagestore_client.h */
+                bytes.put_u8(Tag::DbSize as u8);
                bytes.put_i64(resp.db_size);
            }
        }

        bytes.into()
    }
+
+    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
+        let mut buf = buf.reader();
+        let msg_tag = buf.read_u8()?;
+
+        use PagestreamBeMessageTag as Tag;
+        let ok =
+            match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
+                Tag::Exists => {
+                    let exists = buf.read_u8()?;
+                    Self::Exists(PagestreamExistsResponse {
+                        exists: exists != 0,
+                    })
+                }
+                Tag::Nblocks => {
+                    let n_blocks = buf.read_u32::<BigEndian>()?;
+                    Self::Nblocks(PagestreamNblocksResponse { n_blocks })
+                }
+                Tag::GetPage => {
+                    let mut page = vec![0; 8192]; // TODO: use MaybeUninit
+                    buf.read_exact(&mut page)?;
+                    PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
+                }
+                Tag::Error => {
+                    let buf = buf.get_ref();
+                    let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
+                    let rust_str = cstr.to_str()?;
+                    PagestreamBeMessage::Error(PagestreamErrorResponse {
+                        message: rust_str.to_owned(),
+                    })
+                }
+                Tag::DbSize => {
+                    let db_size = buf.read_i64::<BigEndian>()?;
+                    Self::DbSize(PagestreamDbSizeResponse { db_size })
+                }
+            };
+        let remaining = buf.into_inner();
+        if !remaining.is_empty() {
+            anyhow::bail!(
+                "remaining bytes in msg with tag={msg_tag}: {}",
+                remaining.len()
+            );
+        }
+        Ok(ok)
+    }
+
+    pub fn kind(&self) -> &'static str {
+        match self {
+            Self::Exists(_) => "Exists",
+            Self::Nblocks(_) => "Nblocks",
+            Self::GetPage(_) => "GetPage",
+            Self::Error(_) => "Error",
+            Self::DbSize(_) => "DbSize",
+        }
+    }
 }

 #[cfg(test)]
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -0,0 +1,151 @@
+use utils::lsn::Lsn;
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Partitioning {
+    pub keys: crate::keyspace::KeySpace,
+
+    pub at_lsn: Lsn,
+}
+
+impl serde::Serialize for Partitioning {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
+
+        impl<'a> serde::Serialize for KeySpace<'a> {
+            fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+            where
+                S: serde::Serializer,
+            {
+                use serde::ser::SerializeSeq;
+                let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
+                for kr in &self.0.ranges {
+                    seq.serialize_element(&KeyRange(kr))?;
+                }
+                seq.end()
+            }
+        }
+
+        use serde::ser::SerializeMap;
+        let mut map = serializer.serialize_map(Some(2))?;
+        map.serialize_key("keys")?;
+        map.serialize_value(&KeySpace(&self.keys))?;
+        map.serialize_key("at_lsn")?;
+        map.serialize_value(&WithDisplay(&self.at_lsn))?;
+        map.end()
+    }
+}
+
+pub struct WithDisplay<'a, T>(&'a T);
+
+impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.collect_str(&self.0)
+    }
+}
+
+pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
+
+impl<'a> serde::Serialize for KeyRange<'a> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeTuple;
+        let mut t = serializer.serialize_tuple(2)?;
+        t.serialize_element(&WithDisplay(&self.0.start))?;
+        t.serialize_element(&WithDisplay(&self.0.end))?;
+        t.end()
+    }
+}
+
+impl<'a> serde::Deserialize<'a> for Partitioning {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'a>,
+    {
+        pub struct KeySpace(crate::keyspace::KeySpace);
+
+        impl<'de> serde::Deserialize<'de> for KeySpace {
+            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+            where
+                D: serde::Deserializer<'de>,
+            {
+                #[serde_with::serde_as]
+                #[derive(serde::Deserialize)]
+                #[serde(transparent)]
+                struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
+
+                #[serde_with::serde_as]
+                #[derive(serde::Deserialize)]
+                struct Range(Key, Key);
+
+                let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
+                Ok(Self(crate::keyspace::KeySpace {
+                    ranges: ranges
+                        .into_iter()
+                        .map(|Range(start, end)| (start.0..end.0))
+                        .collect(),
+                }))
+            }
+        }
+
+        #[serde_with::serde_as]
+        #[derive(serde::Deserialize)]
+        struct De {
+            keys: KeySpace,
+            #[serde_as(as = "serde_with::DisplayFromStr")]
+            at_lsn: Lsn,
+        }
+
+        let de: De = serde::Deserialize::deserialize(deserializer)?;
+        Ok(Self {
+            at_lsn: de.at_lsn,
+            keys: de.keys.0,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_serialization_roundtrip() {
+        let reference = r#"
+        {
+            "keys": [
+              [
+                "000000000000000000000000000000000000",
+                "000000000000000000000000000000000001"
+              ],
+              [
+                "000000067F00000001000000000000000000",
+                "000000067F00000001000000000000000002"
+              ],
+              [
+                "030000000000000000000000000000000000",
+                "030000000000000000000000000000000003"
+              ]
+            ],
+            "at_lsn": "0/2240160"
+        }
+        "#;
+
+        let de: Partitioning = serde_json::from_str(reference).unwrap();
+
+        let ser = serde_json::to_string(&de).unwrap();
+
+        let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
+
+        assert_eq!(
+            ser_de,
+            serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
+        );
+    }
+}
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -163,8 +163,18 @@ impl PgConnectionConfig {
    }

    /// Connect using postgres protocol with TLS disabled.
-    pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
-        postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
+    pub async fn connect_no_tls(
+        &self,
+    ) -> Result<
+        (
+            tokio_postgres::Client,
+            tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
+        ),
+        postgres::Error,
+    > {
+        self.to_tokio_postgres_config()
+            .connect(postgres::NoTls)
+            .await
    }
 }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -50,6 +50,8 @@ const_format.workspace = true
 # why is it only here? no other crate should use it, streams are rarely needed.
 tokio-stream = { version = "0.1.14" }

+serde_path_to_error.workspace = true
+
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    if body.remaining() == 0 {
        return Ok(None);
    }
-    serde_json::from_reader(body.reader())
-        .context("Failed to parse json request")
+
+    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
+
+    serde_path_to_error::deserialize(&mut deser)
+        // intentionally stringify because the debug version is not helpful in python logs
+        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
        .map(Some)
        .map_err(ApiError::BadRequest)
 }
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;

 use anyhow::Context;
+use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};

@@ -24,16 +25,48 @@ impl LogFormat {
    }
 }

-static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-    metrics::register_int_counter_vec!(
+struct TracingEventCountMetric {
+    error: IntCounter,
+    warn: IntCounter,
+    info: IntCounter,
+    debug: IntCounter,
+    trace: IntCounter,
+}
+
+static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
+    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
        &["level"]
    )
-    .expect("failed to define metric")
+    .expect("failed to define metric");
+    TracingEventCountMetric::new(vec)
 });

-struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+impl TracingEventCountMetric {
+    fn new(vec: IntCounterVec) -> Self {
+        Self {
+            error: vec.with_label_values(&["error"]),
+            warn: vec.with_label_values(&["warn"]),
+            info: vec.with_label_values(&["info"]),
+            debug: vec.with_label_values(&["debug"]),
+            trace: vec.with_label_values(&["trace"]),
+        }
+    }
+
+    fn inc_for_level(&self, level: tracing::Level) {
+        let counter = match level {
+            tracing::Level::ERROR => &self.error,
+            tracing::Level::WARN => &self.warn,
+            tracing::Level::INFO => &self.info,
+            tracing::Level::DEBUG => &self.debug,
+            tracing::Level::TRACE => &self.trace,
+        };
+        counter.inc();
+    }
+}
+
+struct TracingEventCountLayer(&'static TracingEventCountMetric);

 impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
 where
@@ -44,15 +77,7 @@ where
        event: &tracing::Event<'_>,
        _ctx: tracing_subscriber::layer::Context<'_, S>,
    ) {
-        let level = event.metadata().level();
-        let level = match *level {
-            tracing::Level::ERROR => "error",
-            tracing::Level::WARN => "warn",
-            tracing::Level::INFO => "info",
-            tracing::Level::DEBUG => "debug",
-            tracing::Level::TRACE => "trace",
-        };
-        self.0.with_label_values(&[level]).inc();
+        self.0.inc_for_level(*event.metadata().level());
    }
 }

@@ -106,7 +131,9 @@ pub fn init(
        };
        log_layer.with_filter(rust_log_env_filter())
    });
-    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    let r = r.with(
+        TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
+    );
    match tracing_error_layer_enablement {
        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -257,14 +284,14 @@ impl std::fmt::Debug for SecretString {
 mod tests {
    use metrics::{core::Opts, IntCounterVec};

-    use super::TracingEventCountLayer;
+    use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};

    #[test]
    fn tracing_event_count_metric() {
        let counter_vec =
            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
-        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
-        let layer = TracingEventCountLayer(counter_vec);
+        let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
+        let layer = TracingEventCountLayer(metric);
        use tracing_subscriber::prelude::*;

        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -2,8 +2,11 @@ use std::time::Duration;

 use tokio_util::sync::CancellationToken;

+#[derive(thiserror::Error, Debug)]
 pub enum TimeoutCancellableError {
+    #[error("Timed out")]
    Timeout,
+    #[error("Cancelled")]
    Cancelled,
 }

--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -1 +1,2 @@
+#include "postgres.h"
 #include "walproposer.h"
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -1,3 +1,6 @@
+//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h
+//! to generate Rust bindings for it.
+
 use std::{env, path::PathBuf, process::Command};

 use anyhow::{anyhow, Context};
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -1,3 +1,6 @@
+//! A C-Rust shim: defines implementation of C walproposer API, assuming wp
+//! callback_data stores Box to some Rust implementation.
+
 #![allow(dead_code)]

 use std::ffi::CStr;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -436,9 +436,9 @@ mod tests {
                event_mask: 0,
            }),
            expected_messages: vec![
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
        // walproposer will panic when it finishes sync_safekeepers
        std::panic::catch_unwind(|| wp.start()).unwrap_err();
        // validate the resulting LSN
-        assert_eq!(receiver.recv()?, 1337);
+        assert_eq!(receiver.try_recv(), Ok(1337));
        Ok(())
        // drop() will free up resources here
    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,6 +36,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
+md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
@@ -62,6 +63,7 @@ thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
+tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "pageserver_client"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+pageserver_api.workspace = true
+thiserror.workspace = true
+async-trait.workspace = true
+reqwest.workspace = true
+utils.workspace = true
+serde.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+tokio-postgres.workspace = true
+tokio-stream.workspace = true
+tokio.workspace = true
+futures.workspace = true
+tokio-util.workspace = true
+anyhow.workspace = true
+postgres.workspace = true
+bytes.workspace = true
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -0,0 +1,2 @@
+pub mod mgmt_api;
+pub mod page_service;
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -0,0 +1,188 @@
+use pageserver_api::models::*;
+use reqwest::{IntoUrl, Method};
+use utils::{
+    http::error::HttpErrorBody,
+    id::{TenantId, TimelineId},
+};
+
+#[derive(Debug)]
+pub struct Client {
+    mgmt_api_endpoint: String,
+    authorization_header: Option<String>,
+    client: reqwest::Client,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("receive body: {0}")]
+    ReceiveBody(reqwest::Error),
+
+    #[error("receive error body: {0}")]
+    ReceiveErrorBody(String),
+
+    #[error("pageserver API: {0}")]
+    ApiError(String),
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[async_trait::async_trait]
+pub trait ResponseErrorMessageExt: Sized {
+    async fn error_from_body(self) -> Result<Self>;
+}
+
+#[async_trait::async_trait]
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(mut self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        let url = self.url().to_owned();
+        Err(match self.json::<HttpErrorBody>().await {
+            Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
+            Err(_) => {
+                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
+            }
+        })
+    }
+}
+
+impl Client {
+    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
+        Self {
+            mgmt_api_endpoint,
+            authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
+            client: reqwest::Client::new(),
+        }
+    }
+
+    pub async fn list_tenants(&self) -> Result<Vec<pageserver_api::models::TenantInfo>> {
+        let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
+        let resp = self.get(&uri).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+
+    pub async fn list_timelines(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
+        let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn timeline_info(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<pageserver_api::models::TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn keyspace(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<pageserver_api::models::partitioning::Partitioning> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
+            self.mgmt_api_endpoint
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
+        self.request(Method::GET, uri, ()).await
+    }
+
+    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        let req = self.client.request(method, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
+        let response = res.error_from_body().await?;
+        Ok(response)
+    }
+
+    pub async fn status(&self) -> Result<()> {
+        let uri = format!("{}/v1/status", self.mgmt_api_endpoint);
+        self.get(&uri).await?;
+        Ok(())
+    }
+
+    pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
+        let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
+        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
+        self.request(Method::PUT, &uri, req).await?;
+        Ok(())
+    }
+
+    pub async fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+        flush_ms: Option<std::time::Duration>,
+    ) -> Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+        let path = format!(
+            "{}/v1/tenant/{}/location_config",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+        self.request(Method::PUT, &path, &req_body).await?;
+        Ok(())
+    }
+
+    pub async fn timeline_create(
+        &self,
+        tenant_id: TenantId,
+        req: &TimelineCreateRequest,
+    ) -> Result<TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+}
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -0,0 +1,151 @@
+use std::pin::Pin;
+
+use futures::SinkExt;
+use pageserver_api::{
+    models::{
+        PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
+        PagestreamGetPageResponse,
+    },
+    reltag::RelTag,
+};
+use tokio::task::JoinHandle;
+use tokio_postgres::CopyOutStream;
+use tokio_stream::StreamExt;
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+pub struct Client {
+    client: tokio_postgres::Client,
+    cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
+    conn_task: JoinHandle<()>,
+}
+
+pub struct BasebackupRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub lsn: Option<Lsn>,
+    pub gzip: bool,
+}
+
+impl Client {
+    pub async fn new(connstring: String) -> anyhow::Result<Self> {
+        let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?;
+
+        let conn_task_cancel = CancellationToken::new();
+        let conn_task = tokio::spawn({
+            let conn_task_cancel = conn_task_cancel.clone();
+            async move {
+                tokio::select! {
+                    _ = conn_task_cancel.cancelled() => { }
+                    res = connection => {
+                        res.unwrap();
+                    }
+                }
+            }
+        });
+        Ok(Self {
+            cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
+            conn_task,
+            client,
+        })
+    }
+
+    pub async fn pagestream(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<PagestreamClient> {
+        let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
+            .client
+            .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
+            .await?;
+        let Client {
+            cancel_on_client_drop,
+            conn_task,
+            client: _,
+        } = self;
+        Ok(PagestreamClient {
+            copy_both: Box::pin(copy_both),
+            conn_task,
+            cancel_on_client_drop,
+        })
+    }
+
+    pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result<CopyOutStream> {
+        let BasebackupRequest {
+            tenant_id,
+            timeline_id,
+            lsn,
+            gzip,
+        } = req;
+        let mut args = Vec::with_capacity(5);
+        args.push("basebackup".to_string());
+        args.push(format!("{tenant_id}"));
+        args.push(format!("{timeline_id}"));
+        if let Some(lsn) = lsn {
+            args.push(format!("{lsn}"));
+        }
+        if *gzip {
+            args.push("--gzip".to_string())
+        }
+        Ok(self.client.copy_out(&args.join(" ")).await?)
+    }
+}
+
+/// Create using [`Client::pagestream`].
+pub struct PagestreamClient {
+    copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
+    cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
+    conn_task: JoinHandle<()>,
+}
+
+pub struct RelTagBlockNo {
+    pub rel_tag: RelTag,
+    pub block_no: u32,
+}
+
+impl PagestreamClient {
+    pub async fn shutdown(mut self) {
+        let _ = self.cancel_on_client_drop.take();
+        self.conn_task.await.unwrap();
+    }
+
+    pub async fn getpage(
+        &mut self,
+        key: RelTagBlockNo,
+        lsn: Lsn,
+    ) -> anyhow::Result<PagestreamGetPageResponse> {
+        let req = PagestreamGetPageRequest {
+            latest: false,
+            rel: key.rel_tag,
+            blkno: key.block_no,
+            lsn,
+        };
+        let req = PagestreamFeMessage::GetPage(req);
+        let req: bytes::Bytes = req.serialize();
+        // let mut req = tokio_util::io::ReaderStream::new(&req);
+        let mut req = tokio_stream::once(Ok(req));
+
+        self.copy_both.send_all(&mut req).await?;
+
+        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
+        let next: bytes::Bytes = next.unwrap()?;
+
+        let msg = PagestreamBeMessage::deserialize(next)?;
+        match msg {
+            PagestreamBeMessage::GetPage(p) => Ok(p),
+            PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
+            PagestreamBeMessage::Exists(_)
+            | PagestreamBeMessage::Nblocks(_)
+            | PagestreamBeMessage::DbSize(_) => {
+                anyhow::bail!(
+                    "unexpected be message kind in response to getpage request: {}",
+                    msg.kind()
+                )
+            }
+        }
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
-use pageserver::tenant::TenantSharedResources;
+use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -504,6 +504,17 @@ fn start_pageserver(
        }
    });

+    let secondary_controller = if let Some(remote_storage) = &remote_storage {
+        secondary::spawn_tasks(
+            tenant_manager.clone(),
+            remote_storage.clone(),
+            background_jobs_barrier.clone(),
+            shutdown_pageserver.clone(),
+        )
+    } else {
+        secondary::null_controller()
+    };
+
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
@@ -533,6 +544,7 @@ fn start_pageserver(
                broker_client.clone(),
                disk_usage_eviction_state,
                deletion_queue.new_client(),
+                secondary_controller,
            )
            .context("Failed to initialize router state")?,
        );
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -41,6 +41,8 @@ use crate::{
    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

+use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
+
 pub mod defaults {
    use crate::tenant::config::defaults::*;
    use const_format::formatcp;
@@ -61,6 +63,8 @@ pub mod defaults {

    pub const DEFAULT_LOG_FORMAT: &str = "plain";

+    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
+
    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

@@ -70,6 +74,8 @@ pub mod defaults {
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

+    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+
    ///
    /// Default built-in configuration file.
    ///
@@ -92,6 +98,7 @@ pub mod defaults {
 #log_format = '{DEFAULT_LOG_FORMAT}'

 #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
+#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'

 #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
@@ -117,6 +124,8 @@ pub mod defaults {
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false

+#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
+
 [remote_storage]

 "#
@@ -176,6 +185,11 @@ pub struct PageServerConf {

    pub log_format: LogFormat,

+    /// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
+    /// does not limit tenants loaded in response to client I/O.  A lower value implicitly deprioritizes
+    /// loading such tenants, vs. other work in the system.
+    pub concurrent_tenant_warmup: ConfigurableSemaphore,
+
    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
@@ -215,6 +229,10 @@ pub struct PageServerConf {
    /// If true, pageserver will make best-effort to operate without a control plane: only
    /// for use in major incidents.
    pub control_plane_emergency_mode: bool,
+
+    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
+    /// heatmap uploads vs. other remote storage operations.
+    pub heatmap_upload_concurrency: usize,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -275,6 +293,7 @@ struct PageServerConfigBuilder {

    log_format: BuilderValue<LogFormat>,

+    concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,

    metric_collection_interval: BuilderValue<Duration>,
@@ -293,6 +312,8 @@ struct PageServerConfigBuilder {
    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
    control_plane_emergency_mode: BuilderValue<bool>,
+
+    heatmap_upload_concurrency: BuilderValue<usize>,
 }

 impl Default for PageServerConfigBuilder {
@@ -330,6 +351,8 @@ impl Default for PageServerConfigBuilder {
            .expect("cannot parse default keepalive interval")),
            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),

+            concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                .expect("Invalid default constant")),
            concurrent_tenant_size_logical_size_queries: Set(
                ConfigurableSemaphore::DEFAULT_INITIAL,
            ),
@@ -361,6 +384,8 @@ impl Default for PageServerConfigBuilder {
            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
            control_plane_emergency_mode: Set(false),
+
+            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
        }
    }
 }
@@ -441,6 +466,10 @@ impl PageServerConfigBuilder {
        self.log_format = BuilderValue::Set(log_format)
    }

+    pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
+        self.concurrent_tenant_warmup = BuilderValue::Set(u);
+    }
+
    pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
    }
@@ -501,7 +530,14 @@ impl PageServerConfigBuilder {
        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
    }

+    pub fn heatmap_upload_concurrency(&mut self, value: usize) {
+        self.heatmap_upload_concurrency = BuilderValue::Set(value)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
+        let concurrent_tenant_warmup = self
+            .concurrent_tenant_warmup
+            .ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
            .ok_or(anyhow!(
@@ -554,6 +590,7 @@ impl PageServerConfigBuilder {
                .broker_keepalive_interval
                .ok_or(anyhow!("No broker keepalive interval provided"))?,
            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
                concurrent_tenant_size_logical_size_queries,
            ),
@@ -595,6 +632,10 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+
+            heatmap_upload_concurrency: self
+                .heatmap_upload_concurrency
+                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
        })
    }
 }
@@ -787,6 +828,11 @@ impl PageServerConf {
                "log_format" => builder.log_format(
                    LogFormat::from_config(&parse_toml_string(key, item)?)?
                ),
+                "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
+                    let input = parse_toml_string(key, item)?;
+                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
+                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
+                }),
                "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
                    let input = parse_toml_string(key, item)?;
                    let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
@@ -828,7 +874,9 @@ impl PageServerConf {
                },
                "control_plane_emergency_mode" => {
                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-
+                },
+                "heatmap_upload_concurrency" => {
+                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
@@ -882,6 +930,10 @@ impl PageServerConf {
            broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
            broker_keepalive_interval: Duration::from_secs(5000),
            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(
+                NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
+                    .expect("Invalid default constant"),
+            ),
            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
            ),
@@ -896,6 +948,7 @@ impl PageServerConf {
            control_plane_api: None,
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
+            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
        }
    }
 }
@@ -1099,6 +1152,9 @@ background_task_maximum_delay = '334 s'
                    storage_broker::DEFAULT_KEEPALIVE_INTERVAL
                )?,
                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
+                concurrent_tenant_warmup: ConfigurableSemaphore::new(
+                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
+                ),
                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
                eviction_task_immitated_concurrent_logical_size_queries:
                    ConfigurableSemaphore::default(),
@@ -1120,7 +1176,8 @@ background_task_maximum_delay = '334 s'
                )?,
                control_plane_api: None,
                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_emergency_mode: false,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1164,6 +1221,9 @@ background_task_maximum_delay = '334 s'
                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
                broker_keepalive_interval: Duration::from_secs(5),
                log_format: LogFormat::Json,
+                concurrent_tenant_warmup: ConfigurableSemaphore::new(
+                    NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
+                ),
                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
                eviction_task_immitated_concurrent_logical_size_queries:
                    ConfigurableSemaphore::default(),
@@ -1177,7 +1237,8 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_emergency_mode: false,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,7 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -256,8 +256,6 @@ async fn calculate_synthetic_size_worker(
        info!("calculate_synthetic_size_worker stopped");
    };

-    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
-
    loop {
        let started_at = Instant::now();

@@ -280,29 +278,14 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) {
-                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-                // We can put in some prioritization for consumption metrics.
-                // Same for the loop that fetches computed metrics.
-                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-                // which turns out is really handy to understand the system.
-                if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
-                    // this error can be returned if timeline is shutting down, but it does not
-                    // mean the synthetic size worker should terminate. we do not need any checks
-                    // in this function because `mgr::get_tenant` will error out after shutdown has
-                    // progressed to shutting down tenants.
-                    let is_cancelled = matches!(
-                        e.downcast_ref::<PageReconstructError>(),
-                        Some(PageReconstructError::Cancelled)
-                    );
+            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+                continue;
+            };

-                    if !is_cancelled {
-                        error!(
-                            "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"
-                        );
-                    }
-                }
-            }
+            // there is never any reason to exit calculate_synthetic_size_worker following any
+            // return value -- we don't need to care about shutdown because no tenant is found when
+            // pageserver is shut down.
+            calculate_and_log(&tenant, cancel, ctx).await;
        }

        crate::tenant::tasks::warn_when_period_overrun(
@@ -321,3 +304,31 @@ async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+    const CAUSE: LogicalSizeCalculationCause =
+        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
+
+    // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+    // We can put in some prioritization for consumption metrics.
+    // Same for the loop that fetches computed metrics.
+    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+    // which turns out is really handy to understand the system.
+    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+        return;
+    };
+
+    // this error can be returned if timeline is shutting down, but it does not
+    // mean the synthetic size worker should terminate. we do not need any checks
+    // in this function because `mgr::get_tenant` will error out after shutdown has
+    // progressed to shutting down tenants.
+    let shutting_down = matches!(
+        e.downcast_ref::<PageReconstructError>(),
+        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+    );
+
+    if !shutting_down {
+        let tenant_shard_id = tenant.tenant_shard_id();
+        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+    }
+}
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -42,7 +42,6 @@
 //   reading these fields. We use the Debug impl for semi-structured logging, though.

 use std::{
-    collections::HashMap,
    sync::Arc,
    time::{Duration, SystemTime},
 };
@@ -125,7 +124,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    _storage: &GenericRemoteStorage,
+    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -149,8 +148,14 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res =
-                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
+            let res = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;

            match res {
                Ok(()) => {}
@@ -181,12 +186,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -268,8 +274,9 @@ struct LayerCount {
    count: usize,
 }

-pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
+    _storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -321,16 +328,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Walk through the list of candidates, until we have accumulated enough layers to get
    // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
    // how much disk space would be used after evicting all the layers up to the current
-    // point in the list. The layers are collected in 'batched', grouped per timeline.
+    // point in the list.
    //
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
-    let mut max_batch_size = 0;
-    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+    let mut evicted_amount = 0;
+
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
                no_candidates_evicted = i,
@@ -339,25 +346,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            break;
        }

-        if partition == MinResidentSizePartition::Below && warned.is_none() {
+        if partition == &MinResidentSizePartition::Below && warned.is_none() {
            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
            warned = Some(usage_planned);
        }

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
-
-        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-        // tasks to evict all seen layers until we have evicted enough
-
-        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
-
-        // semaphore will later be used to limit eviction concurrency, and we can express at
-        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-        // but fail gracefully by not making batches larger.
-        if batch.len() < u32::MAX as usize {
-            batch.push(candidate.layer);
-            max_batch_size = max_batch_size.max(batch.len());
-        }
+        evicted_amount += 1;
    }

    let usage_planned = match warned {
@@ -372,100 +367,79 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    };
    debug!(?usage_planned, "usage planned");

-    // phase2: evict victims batched by timeline
+    // phase2: evict layers

    let mut js = tokio::task::JoinSet::new();
+    let limit = 1000;

-    // ratelimit to 1k files or any higher max batch size
-    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+    let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
+    let mut consumed_all = false;

-    for (timeline, batch) in batched {
-        let tenant_shard_id = timeline.tenant_shard_id;
-        let timeline_id = timeline.timeline_id;
-        let batch_size =
-            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+    // After the evictions, `usage_assumed` is the post-eviction usage,
+    // according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();

-        // I dislike naming of `available_permits` but it means current total amount of permits
-        // because permits can be added
-        assert!(batch_size as usize <= limit.available_permits());
+    let evict_layers = async move {
+        loop {
+            let next = if js.len() >= limit || consumed_all {
+                js.join_next().await
+            } else if !js.is_empty() {
+                // opportunistically consume ready result, one per each new evicted
+                futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
+            } else {
+                None
+            };

-        debug!(%timeline_id, "evicting batch for timeline");
-
-        let evict = {
-            let limit = limit.clone();
-            let cancel = cancel.clone();
-            async move {
-                let mut evicted_bytes = 0;
-                let mut evictions_failed = LayerCount::default();
-
-                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-                    // semaphore closing means cancelled
-                    return (evicted_bytes, evictions_failed);
-                };
-
-                let results = timeline.evict_layers(&batch).await;
-
-                match results {
-                    Ok(results) => {
-                        assert_eq!(results.len(), batch.len());
-                        for (result, layer) in results.into_iter().zip(batch.iter()) {
-                            let file_size = layer.layer_desc().file_size;
-                            match result {
-                                Some(Ok(())) => {
-                                    evicted_bytes += file_size;
-                                }
-                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                                    evictions_failed.file_sizes += file_size;
-                                    evictions_failed.count += 1;
-                                }
-                                None => {
-                                    assert!(cancel.is_cancelled());
-                                }
-                            }
-                        }
+            if let Some(next) = next {
+                match next {
+                    Ok(Ok(file_size)) => {
+                        usage_assumed.add_available_bytes(file_size);
                    }
-                    Err(e) => {
-                        warn!("failed to evict batch: {:#}", e);
+                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+                        evictions_failed.file_sizes += file_size;
+                        evictions_failed.count += 1;
                    }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => { /* already logged */ }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
                }
-                (evicted_bytes, evictions_failed)
            }
-        }
-        .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));

-        js.spawn(evict);
-
-        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
-        // chance of making progress
-        tokio::task::yield_now().await;
-    }
-
-    let join_all = async move {
-        // After the evictions, `usage_assumed` is the post-eviction usage,
-        // according to internal accounting.
-        let mut usage_assumed = usage_pre;
-        let mut evictions_failed = LayerCount::default();
-
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok((evicted_bytes, failed)) => {
-                    usage_assumed.add_available_bytes(evicted_bytes);
-                    evictions_failed.file_sizes += failed.file_sizes;
-                    evictions_failed.count += failed.count;
-                }
-                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                Err(je) if je.is_panic() => { /* already logged */ }
-                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            if consumed_all && js.is_empty() {
+                break;
            }
+
+            // calling again when consumed_all is fine as evicted is fused.
+            let Some((_partition, candidate)) = evicted.next() else {
+                consumed_all = true;
+                continue;
+            };
+
+            js.spawn(async move {
+                let rtc = candidate.timeline.remote_client.as_ref().expect(
+                    "holding the witness, all timelines must have a remote timeline client",
+                );
+                let file_size = candidate.layer.layer_desc().file_size;
+                candidate
+                    .layer
+                    .evict_and_wait(rtc)
+                    .await
+                    .map(|()| file_size)
+                    .map_err(|e| (file_size, e))
+            });
+
+            tokio::task::yield_now().await;
        }
+
        (usage_assumed, evictions_failed)
    };

    let (usage_assumed, evictions_failed) = tokio::select! {
-        tuple = join_all => { tuple },
+        tuple = evict_layers => { tuple },
        _ = cancel.cancelled() => {
-            // close the semaphore to stop any pending acquires
-            limit.close();
+            // dropping joinset will abort all pending evict_and_waits and that is fine, our
+            // requests will still stand
            return Ok(IterationOutcome::Cancelled);
        }
    };
--- a/pageserver/src/http/mod.rs
+++ b/pageserver/src/http/mod.rs
@@ -1,4 +1,2 @@
 pub mod routes;
 pub use routes::make_router;
-
-pub use pageserver_api::models;
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -992,8 +992,8 @@ paths:
          type: string
    post:
      description: |
-        Create a timeline. Returns new timeline id on success.\
-        If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
+        Create a timeline. Returns new timeline id on success.
+        Recreating the same timeline will succeed if the parameters match the existing timeline.
        If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
      requestBody:
        content:
@@ -1405,6 +1405,8 @@ components:
          type: integer
        trace_read_requests:
          type: boolean
+        heatmap_period:
+          type: integer
    TenantConfigResponse:
      type: object
      properties:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -28,20 +28,18 @@ use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

-use super::models::{
-    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
-    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
-};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
+use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
+use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
@@ -49,6 +47,10 @@ use crate::tenant::timeline::Timeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
+use pageserver_api::models::{
+    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
+    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
+};
 use utils::{
    auth::SwappableJwtAuth,
    generation::Generation,
@@ -64,7 +66,12 @@ use utils::{
 };

 // Imports only used for testing APIs
-use super::models::ConfigureFailpointsRequest;
+use pageserver_api::models::ConfigureFailpointsRequest;
+
+// For APIs that require an Active tenant, how long should we block waiting for that state?
+// This is not functionally necessary (clients will retry), but avoids generating a lot of
+// failed API calls while tenants are activating.
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);

 pub struct State {
    conf: &'static PageServerConf,
@@ -75,9 +82,11 @@ pub struct State {
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
+    secondary_controller: SecondaryController,
 }

 impl State {
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
@@ -86,6 +95,7 @@ impl State {
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
+        secondary_controller: SecondaryController,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
            .iter()
@@ -100,6 +110,7 @@ impl State {
            broker_client,
            disk_usage_eviction_state,
            deletion_queue_client,
+            secondary_controller,
        })
    }

@@ -136,11 +147,6 @@ impl From<PageReconstructError> for ApiError {
    fn from(pre: PageReconstructError) -> ApiError {
        match pre {
            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
-            PageReconstructError::NeedsDownload(_, _) => {
-                // This shouldn't happen, because we use a RequestContext that requests to
-                // download any missing layer files on-demand.
-                ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
-            }
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
@@ -233,6 +239,19 @@ impl From<GetTenantError> for ApiError {
    }
 }

+impl From<GetActiveTenantError> for ApiError {
+    fn from(e: GetActiveTenantError) -> ApiError {
+        match e {
+            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
+            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
+            GetActiveTenantError::NotFound(gte) => gte.into(),
+            GetActiveTenantError::WaitForActiveTimeout { .. } => {
+                ApiError::ResourceUnavailable(format!("{}", e).into())
+            }
+        }
+    }
+}
+
 impl From<SetNewTenantConfigError> for ApiError {
    fn from(e: SetNewTenantConfigError) -> ApiError {
        match e {
@@ -435,7 +454,10 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
+        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -453,7 +475,7 @@ async fn timeline_create_handler(
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
-            Err(tenant::CreateTimelineError::AlreadyExists) => {
+            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                json_response(StatusCode::CONFLICT, ())
            }
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
@@ -694,11 +716,23 @@ async fn timeline_delete_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
    let state = get_state(&request);

-    state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
-        .instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id, false)
+        .map_err(|e| {
+            match e {
+                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
+                // want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
+                GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
+                    "Requested tenant is missing".to_string().into_boxed_str(),
+                ),
+                e => e.into(),
+            }
+        })?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -1136,7 +1170,10 @@ async fn tenant_create_handler(

    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
-    if let res @ Err(_) = new_tenant.wait_to_become_active().await {
+    if let res @ Err(_) = new_tenant
+        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+        .await
+    {
        // This shouldn't happen because we just created the tenant directory
        // in tenant::mgr::create_tenant, and there aren't any remote timelines
        // to load, so, nothing can really fail during load.
@@ -1487,69 +1524,6 @@ async fn timeline_collect_keyspace(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    struct Partitioning {
-        keys: crate::keyspace::KeySpace,
-
-        at_lsn: Lsn,
-    }
-
-    impl serde::Serialize for Partitioning {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeMap;
-            let mut map = serializer.serialize_map(Some(2))?;
-            map.serialize_key("keys")?;
-            map.serialize_value(&KeySpace(&self.keys))?;
-            map.serialize_key("at_lsn")?;
-            map.serialize_value(&WithDisplay(&self.at_lsn))?;
-            map.end()
-        }
-    }
-
-    struct WithDisplay<'a, T>(&'a T);
-
-    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            serializer.collect_str(&self.0)
-        }
-    }
-
-    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
-
-    impl<'a> serde::Serialize for KeySpace<'a> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeSeq;
-            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
-            for kr in &self.0.ranges {
-                seq.serialize_element(&KeyRange(kr))?;
-            }
-            seq.end()
-        }
-    }
-
-    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
-
-    impl<'a> serde::Serialize for KeyRange<'a> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeTuple;
-            let mut t = serializer.serialize_tuple(2)?;
-            t.serialize_element(&WithDisplay(&self.0.start))?;
-            t.serialize_element(&WithDisplay(&self.0.end))?;
-            t.end()
-        }
-    }
-
    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;

    async {
@@ -1561,7 +1535,9 @@ async fn timeline_collect_keyspace(
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
+        let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
+
+        json_response(StatusCode::OK, res)
    }
    .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
@@ -1593,7 +1569,7 @@ async fn always_panic_handler(

 async fn disk_usage_eviction_run(
    mut r: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&r, None)?;

@@ -1621,57 +1597,48 @@ async fn disk_usage_eviction_run(
        }
    }

-    let config = json_request::<Config>(&mut r)
-        .await
-        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+    let config = json_request::<Config>(&mut r).await?;

    let usage = Usage {
        config,
        freed_bytes: 0,
    };

-    let (tx, rx) = tokio::sync::oneshot::channel();
-
    let state = get_state(&r);

-    if state.remote_storage.as_ref().is_none() {
+    let Some(storage) = state.remote_storage.as_ref() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    }
+    };

    let state = state.disk_usage_eviction_state.clone();

-    let cancel = CancellationToken::new();
-    let child_cancel = cancel.clone();
-    let _g = cancel.drop_guard();
+    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
+        &state, storage, usage, &cancel,
+    )
+    .await;

-    crate::task_mgr::spawn(
-        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::DiskUsageEviction,
-        None,
-        None,
-        "ondemand disk usage eviction",
-        false,
-        async move {
-            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-                &state,
-                usage,
-                &child_cancel,
-            )
-            .await;
+    info!(?res, "disk_usage_eviction_task_iteration_impl finished");

-            info!(?res, "disk_usage_eviction_task_iteration_impl finished");
+    let res = res.map_err(ApiError::InternalServerError)?;

-            let _ = tx.send(res);
-            Ok(())
-        }
-        .in_current_span(),
-    );
+    json_response(StatusCode::OK, res)
+}

-    let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
+async fn secondary_upload_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    state
+        .secondary_controller
+        .upload_tenant(tenant_shard_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;

-    json_response(StatusCode::OK, response)
+    json_response(StatusCode::OK, ())
 }

 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1933,6 +1900,9 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
+        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
+            api_handler(r, secondary_upload_handler)
+        })
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -10,7 +10,7 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
-pub mod keyspace;
+pub use pageserver_api::keyspace;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2,9 +2,10 @@ use enum_map::EnumMap;
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
-    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
-    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
-    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
+    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
+    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
@@ -683,14 +684,54 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("Failed to register pageserver_startup_is_loading")
 });

-/// How long did tenants take to go from construction to active state?
-pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
+/// like how long it took to load.
+///
+/// Note that these are process-global metrics, _not_ per-tenant metrics.  Per-tenant
+/// metrics are rather expensive, and usually fine grained stuff makes more sense
+/// at a timeline level than tenant level.
+pub(crate) struct TenantMetrics {
+    /// How long did tenants take to go from construction to active state?
+    pub(crate) activation: Histogram,
+    pub(crate) preload: Histogram,
+    pub(crate) attach: Histogram,
+
+    /// How many tenants are included in the initial startup of the pagesrever?
+    pub(crate) startup_scheduled: IntCounter,
+    pub(crate) startup_complete: IntCounter,
+}
+
+pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
+    TenantMetrics {
+    activation: register_histogram!(
        "pageserver_tenant_activation_seconds",
        "Time taken by tenants to activate, in seconds",
        CRITICAL_OP_BUCKETS.into()
    )
-    .expect("Failed to register pageserver_tenant_activation_seconds metric")
+    .expect("Failed to register metric"),
+    preload: register_histogram!(
+        "pageserver_tenant_preload_seconds",
+        "Time taken by tenants to load remote metadata on startup/attach, in seconds",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register metric"),
+    attach: register_histogram!(
+        "pageserver_tenant_attach_seconds",
+        "Time taken by tenants to intialize, after remote metadata is already loaded",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register metric"),
+    startup_scheduled: register_int_counter!(
+        "pageserver_tenant_startup_scheduled",
+        "Number of tenants included in pageserver startup (doesn't count tenants attached later)"
+    ).expect("Failed to register metric"),
+    startup_complete: register_int_counter!(
+        "pageserver_tenant_startup_complete",
+        "Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
+         should eventually reach `pageserver_tenant_startup_scheduled_total`.  Does not include broken \
+         tenants: such cases will lead to this metric never reaching the scheduled count."
+    ).expect("Failed to register metric"),
+}
 });

 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
@@ -1270,6 +1311,28 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
    )
    .expect("failed to define a metric"),
 });
+pub(crate) struct SecondaryModeMetrics {
+    pub(crate) upload_heatmap: IntCounter,
+    pub(crate) upload_heatmap_errors: IntCounter,
+    pub(crate) upload_heatmap_duration: Histogram,
+}
+pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
+    upload_heatmap: register_int_counter!(
+        "pageserver_secondary_upload_heatmap",
+        "Number of heatmaps written to remote storage by attached tenants"
+    )
+    .expect("failed to define a metric"),
+    upload_heatmap_errors: register_int_counter!(
+        "pageserver_secondary_upload_heatmap_errors",
+        "Failures writing heatmap to remote storage"
+    )
+    .expect("failed to define a metric"),
+    upload_heatmap_duration: register_histogram!(
+        "pageserver_secondary_upload_heatmap_duration",
+        "Time to build and upload a heatmap, including any waiting inside the S3 client"
+    )
+    .expect("failed to define a metric"),
+});

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
@@ -1321,25 +1384,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            &["task"],
-        )
-        .unwrap()
-    });
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap()
-    });
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_background_loop_semaphore_wait_start_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls started",
+        "pageserver_background_loop_semaphore_wait_finish_count",
+        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+        &["task"],
+    )
+    .unwrap()
+});

 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
@@ -2199,6 +2253,9 @@ pub fn preinitialize_metrics() {
    // Deletion queue stats
    Lazy::force(&DELETION_QUEUE);

+    // Tenant stats
+    Lazy::force(&TENANT);
+
    // Tenant manager stats
    Lazy::force(&TENANT_MANAGER);

--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -2,38 +2,11 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::ops::{AddAssign, Range};
+use std::ops::AddAssign;
 use std::time::Duration;

 pub use pageserver_api::key::{Key, KEY_SIZE};

-pub fn key_range_size(key_range: &Range<Key>) -> u32 {
-    let start = key_range.start;
-    let end = key_range.end;
-
-    if end.field1 != start.field1
-        || end.field2 != start.field2
-        || end.field3 != start.field3
-        || end.field4 != start.field4
-    {
-        return u32::MAX;
-    }
-
-    let start = (start.field5 as u64) << 32 | start.field6 as u64;
-    let end = (end.field5 as u64) << 32 | end.field6 as u64;
-
-    let diff = end - start;
-    if diff > u32::MAX as u64 {
-        u32::MAX
-    } else {
-        diff as u32
-    }
-}
-
-pub fn singleton_range(key: Key) -> Range<Key> {
-    key..key.next()
-}
-
 /// A 'value' stored for a one Key.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[cfg_attr(test, derive(PartialEq))]
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -258,6 +258,9 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

+    /// See [`crate::tenant::secondary`].
+    SecondaryUploads,
+
    // Initial logical size calculation
    InitialLogicalSizeCalculation,

@@ -558,9 +561,14 @@ pub async fn shutdown_watcher() {
 /// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
 /// `tokio::task::JoinSet::spawn`.
 pub fn shutdown_token() -> CancellationToken {
-    SHUTDOWN_TOKEN
-        .try_with(|t| t.clone())
-        .expect("shutdown_token() called in an unexpected task or thread")
+    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
+
+    if cfg!(test) {
+        // in tests this method is called from non-taskmgr spawned tasks, and that is all ok.
+        res.unwrap_or_default()
+    } else {
+        res.expect("shutdown_token() called in an unexpected task or thread")
+    }
 }

 /// Has the current task been requested to shut down?
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -36,6 +36,8 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
+use utils::timeout::timeout_cancellable;
+use utils::timeout::TimeoutCancellableError;

 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
@@ -48,6 +50,7 @@ use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
+use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
@@ -58,7 +61,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
-use crate::metrics::TENANT_ACTIVATION;
+use crate::metrics::TENANT;
 use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -87,7 +90,6 @@ use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::MutexGuard;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

@@ -144,6 +146,7 @@ pub mod storage_layer;
 pub mod config;
 pub mod delete;
 pub mod mgr;
+pub mod secondary;
 pub mod tasks;
 pub mod upload_queue;

@@ -225,7 +228,7 @@ pub struct Tenant {

    /// The value creation timestamp, used to measure activation delay, see:
    /// <https://github.com/neondatabase/neon/issues/4025>
-    loading_started_at: Instant,
+    constructed_at: Instant,

    state: watch::Sender<TenantState>,

@@ -248,6 +251,12 @@ pub struct Tenant {
    generation: Generation,

    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
+
+    /// During timeline creation, we first insert the TimelineId to the
+    /// creating map, then `timelines`, then remove it from the creating map.
+    /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating`
+    timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
+
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
    // `timelines` mutex during all GC iteration
@@ -269,6 +278,11 @@ pub struct Tenant {

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

+    /// If the tenant is in Activating state, notify this to encourage it
+    /// to proceed to Active as soon as possible, rather than waiting for lazy
+    /// background warmup.
+    pub(crate) activate_now_sem: tokio::sync::Semaphore,
+
    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,

    // Cancellation token fires when we have entered shutdown().  This is a parent of
@@ -406,8 +420,10 @@ impl Debug for SetStoppingError {

 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
-    #[error("a timeline with the given ID already exists")]
-    AlreadyExists,
+    #[error("creation of timeline with the given ID is in progress")]
+    AlreadyCreating,
+    #[error("timeline already exists with different parameters")]
+    Conflict,
    #[error(transparent)]
    AncestorLsn(anyhow::Error),
    #[error("ancestor timeline is not active")]
@@ -613,6 +629,14 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
+                // Is this tenant being spawned as part of process startup?
+                let starting_up = init_order.is_some();
+                scopeguard::defer! {
+                    if starting_up {
+                        TENANT.startup_complete.inc();
+                    }
+                }
+
                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
                let make_broken =
                    |t: &Tenant, err: anyhow::Error| {
@@ -639,8 +663,62 @@ impl Tenant {
                    .as_mut()
                    .and_then(|x| x.initial_tenant_load_remote.take());

+                enum AttachType<'a> {
+                    // During pageserver startup, we are attaching this tenant lazily in the background
+                    Warmup(tokio::sync::SemaphorePermit<'a>),
+                    // During pageserver startup, we are attaching this tenant as soon as we can,
+                    // because a client tried to access it.
+                    OnDemand,
+                    // During normal operations after startup, we are attaching a tenant.
+                    Normal,
+                }
+
+                // Before doing any I/O, wait for either or:
+                // - A client to attempt to access to this tenant (on-demand loading)
+                // - A permit to become available in the warmup semaphore (background warmup)
+                //
+                // Some-ness of init_order is how we know if we're attaching during startup or later
+                // in process lifetime.
+                let attach_type = if init_order.is_some() {
+                    tokio::select!(
+                        _ = tenant_clone.activate_now_sem.acquire() => {
+                            tracing::info!("Activating tenant (on-demand)");
+                            AttachType::OnDemand
+                        },
+                        permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
+                            match permit_result {
+                                Ok(p) => {
+                                    tracing::info!("Activating tenant (warmup)");
+                                    AttachType::Warmup(p)
+                                }
+                                Err(_) => {
+                                    // This is unexpected: the warmup semaphore should stay alive
+                                    // for the lifetime of init_order.  Log a warning and proceed.
+                                    tracing::warn!("warmup_limit semaphore unexpectedly closed");
+                                    AttachType::Normal
+                                }
+                            }
+
+                        }
+                        _ = tenant_clone.cancel.cancelled() => {
+                            // This is safe, but should be pretty rare: it is interesting if a tenant
+                            // stayed in Activating for such a long time that shutdown found it in
+                            // that state.
+                            tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
+                            return Ok(());
+                        },
+                    )
+                } else {
+                    AttachType::Normal
+                };
+
+                let preload_timer = TENANT.preload.start_timer();
                let preload = match mode {
-                    SpawnMode::Create => {None},
+                    SpawnMode::Create => {
+                        // Don't count the skipped preload into the histogram of preload durations
+                        preload_timer.stop_and_discard();
+                        None
+                    },
                    SpawnMode::Normal => {
                        match &remote_storage {
                            Some(remote_storage) => Some(
@@ -650,7 +728,11 @@ impl Tenant {
                                        tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
                                    )
                                    .await {
-                                        Ok(p) => p,
+                                        Ok(p) => {
+                                            preload_timer.observe_duration();
+                                            p
+                                        }
+                                            ,
                                        Err(e) => {
                                            make_broken(&tenant_clone, anyhow::anyhow!(e));
                                                return Ok(());
@@ -712,15 +794,43 @@ impl Tenant {
                    }
                }

+                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
+                let attach_timer = match mode {
+                    SpawnMode::Create => None,
+                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                };
                match tenant_clone.attach(preload, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
+                        if let Some(t)=  attach_timer {t.observe_duration();}
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
+                        if let Some(t)=  attach_timer {t.observe_duration();}
                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }
                }
+
+                // If we are doing an opportunistic warmup attachment at startup, initialize
+                // logical size at the same time.  This is better than starting a bunch of idle tenants
+                // with cold caches and then coming back later to initialize their logical sizes.
+                //
+                // It also prevents the warmup proccess competing with the concurrency limit on
+                // logical size calculations: if logical size calculation semaphore is saturated,
+                // then warmup will wait for that before proceeding to the next tenant.
+                if let AttachType::Warmup(_permit) = attach_type {
+                    let mut futs = FuturesUnordered::new();
+                    let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
+                    for t in timelines {
+                        futs.push(t.await_initial_logical_size())
+                    }
+                    tracing::info!("Waiting for initial logical sizes while warming up...");
+                    while futs.next().await.is_some() {
+
+                    }
+                    tracing::info!("Warm-up complete");
+                }
+
                Ok(())
            }
            .instrument({
@@ -1457,7 +1567,7 @@ impl Tenant {
    /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
    /// minimum amount of keys required to get a writable timeline.
    /// (Without it, `put` might fail due to `repartition` failing.)
-    pub async fn create_empty_timeline(
+    pub(crate) async fn create_empty_timeline(
        &self,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
@@ -1469,10 +1579,7 @@ impl Tenant {
            "Cannot create empty timelines on inactive tenant"
        );

-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
-        };
+        let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
        let new_metadata = TimelineMetadata::new(
            // Initialize disk_consistent LSN to 0, The caller must import some data to
            // make it valid, before calling finish_creation()
@@ -1549,7 +1656,7 @@ impl Tenant {
    /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
-    pub async fn create_timeline(
+    pub(crate) async fn create_timeline(
        &self,
        new_timeline_id: TimelineId,
        ancestor_timeline_id: Option<TimelineId>,
@@ -1570,26 +1677,51 @@ impl Tenant {
            .enter()
            .map_err(|_| CreateTimelineError::ShuttingDown)?;

-        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
-            debug!("timeline {new_timeline_id} already exists");
-
-            if let Some(remote_client) = existing.remote_client.as_ref() {
-                // Wait for uploads to complete, so that when we return Ok, the timeline
-                // is known to be durable on remote storage. Just like we do at the end of
-                // this function, after we have created the timeline ourselves.
-                //
-                // We only really care that the initial version of `index_part.json` has
-                // been uploaded. That's enough to remember that the timeline
-                // exists. However, there is no function to wait specifically for that so
-                // we just wait for all in-progress uploads to finish.
-                remote_client
-                    .wait_completion()
-                    .await
-                    .context("wait for timeline uploads to complete")?;
+        // Get exclusive access to the timeline ID: this ensures that it does not already exist,
+        // and that no other creation attempts will be allowed in while we are working.  The
+        // uninit_mark is a guard.
+        let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
+            Ok(m) => m,
+            Err(TimelineExclusionError::AlreadyCreating) => {
+                // Creation is in progress, we cannot create it again, and we cannot
+                // check if this request matches the existing one, so caller must try
+                // again later.
+                return Err(CreateTimelineError::AlreadyCreating);
            }
+            Err(TimelineExclusionError::Other(e)) => {
+                return Err(CreateTimelineError::Other(e));
+            }
+            Err(TimelineExclusionError::AlreadyExists(existing)) => {
+                debug!("timeline {new_timeline_id} already exists");

-            return Err(CreateTimelineError::AlreadyExists);
-        }
+                // Idempotency: creating the same timeline twice is not an error, unless
+                // the second creation has different parameters.
+                if existing.get_ancestor_timeline_id() != ancestor_timeline_id
+                    || existing.pg_version != pg_version
+                    || (ancestor_start_lsn.is_some()
+                        && ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
+                {
+                    return Err(CreateTimelineError::Conflict);
+                }
+
+                if let Some(remote_client) = existing.remote_client.as_ref() {
+                    // Wait for uploads to complete, so that when we return Ok, the timeline
+                    // is known to be durable on remote storage. Just like we do at the end of
+                    // this function, after we have created the timeline ourselves.
+                    //
+                    // We only really care that the initial version of `index_part.json` has
+                    // been uploaded. That's enough to remember that the timeline
+                    // exists. However, there is no function to wait specifically for that so
+                    // we just wait for all in-progress uploads to finish.
+                    remote_client
+                        .wait_completion()
+                        .await
+                        .context("wait for timeline uploads to complete")?;
+                }
+
+                return Ok(existing);
+            }
+        };

        let loaded_timeline = match ancestor_timeline_id {
            Some(ancestor_timeline_id) => {
@@ -1626,18 +1758,32 @@ impl Tenant {
                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
                }

-                self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
-                    .await?
+                self.branch_timeline(
+                    &ancestor_timeline,
+                    new_timeline_id,
+                    ancestor_start_lsn,
+                    uninit_mark,
+                    ctx,
+                )
+                .await?
            }
            None => {
-                self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
-                    .await?
+                self.bootstrap_timeline(
+                    new_timeline_id,
+                    pg_version,
+                    load_existing_initdb,
+                    uninit_mark,
+                    ctx,
+                )
+                .await?
            }
        };

+        // At this point we have dropped our guard on [`Self::timelines_creating`], and
+        // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet.  We must
+        // not send a success to the caller until it is.  The same applies to handling retries,
+        // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
-            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
-            // Ok, the timeline is durable in remote storage.
            let kind = ancestor_timeline_id
                .map(|_| "branched")
                .unwrap_or("bootstrapped");
@@ -1651,6 +1797,15 @@ impl Tenant {
        Ok(loaded_timeline)
    }

+    pub(crate) async fn delete_timeline(
+        self: Arc<Self>,
+        timeline_id: TimelineId,
+    ) -> Result<(), DeleteTimelineError> {
+        DeleteTimelineFlow::run(&self, timeline_id, false).await?;
+
+        Ok(())
+    }
+
    /// perform one garbage collection iteration, removing old data files from disk.
    /// this function is periodically called by gc task.
    /// also it can be explicitly requested through page server api 'do_gc' command.
@@ -1812,7 +1967,7 @@ impl Tenant {
                );
                *current_state = TenantState::Active;

-                let elapsed = self.loading_started_at.elapsed();
+                let elapsed = self.constructed_at.elapsed();
                let total_timelines = timelines_accessor.len();

                // log a lot of stuff, because some tenants sometimes suffer from user-visible
@@ -1827,7 +1982,7 @@ impl Tenant {
                    "activation attempt finished"
                );

-                TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
+                TENANT.activation.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -2082,18 +2237,41 @@ impl Tenant {
        self.state.subscribe()
    }

-    pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
+    /// The activate_now semaphore is initialized with zero units.  As soon as
+    /// we add a unit, waiters will be able to acquire a unit and proceed.
+    pub(crate) fn activate_now(&self) {
+        self.activate_now_sem.add_permits(1);
+    }
+
+    pub(crate) async fn wait_to_become_active(
+        &self,
+        timeout: Duration,
+    ) -> Result<(), GetActiveTenantError> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = receiver.borrow_and_update().clone();
            match current_state {
                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
-                    receiver.changed().await.map_err(
-                        |_e: tokio::sync::watch::error::RecvError|
-                            // Tenant existed but was dropped: report it as non-existent
-                            GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
-                    )?;
+                    self.activate_now();
+                    match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
+                        Ok(r) => {
+                            r.map_err(
+                            |_e: tokio::sync::watch::error::RecvError|
+                                // Tenant existed but was dropped: report it as non-existent
+                                GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
+                        )?
+                        }
+                        Err(TimeoutCancellableError::Cancelled) => {
+                            return Err(GetActiveTenantError::Cancelled);
+                        }
+                        Err(TimeoutCancellableError::Timeout) => {
+                            return Err(GetActiveTenantError::WaitForActiveTimeout {
+                                latest_state: Some(self.current_state()),
+                                wait_time: timeout,
+                            });
+                        }
+                    }
                }
                TenantState::Active { .. } => {
                    return Ok(());
@@ -2114,6 +2292,14 @@ impl Tenant {
            .attach_mode
            .clone()
    }
+
+    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+
+    pub(crate) fn get_generation(&self) -> Generation {
+        self.generation
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2252,6 +2438,18 @@ impl Tenant {
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

+    pub fn get_heatmap_period(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let heatmap_period = tenant_conf
+            .heatmap_period
+            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
+        if heatmap_period.is_zero() {
+            None
+        } else {
+            Some(heatmap_period)
+        }
+    }
+
    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
        // Don't hold self.timelines.lock() during the notifies.
@@ -2398,9 +2596,10 @@ impl Tenant {
            conf,
            // using now here is good enough approximation to catch tenants with really long
            // activation times.
-            loading_started_at: Instant::now(),
+            constructed_at: Instant::now(),
            tenant_conf: Arc::new(RwLock::new(attached_conf)),
            timelines: Mutex::new(HashMap::new()),
+            timelines_creating: Mutex::new(HashSet::new()),
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
            remote_storage,
@@ -2409,6 +2608,7 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            activate_now_sem: tokio::sync::Semaphore::new(0),
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
            cancel: CancellationToken::default(),
            gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
@@ -2792,8 +2992,9 @@ impl Tenant {
        start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
+        let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
        let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
+            .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
            .await?;
        tl.set_state(TimelineState::Active);
        Ok(tl)
@@ -2807,9 +3008,10 @@ impl Tenant {
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
+        timeline_uninit_mark: TimelineUninitMark<'_>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
+        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
            .await
    }

@@ -2818,13 +3020,14 @@ impl Tenant {
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
+        timeline_uninit_mark: TimelineUninitMark<'_>,
        _ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;

-        // First acquire the GC lock so that another task cannot advance the GC
-        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
-        // creating the branch.
+        // We will validate our ancestor LSN in this function.  Acquire the GC lock so that
+        // this check cannot race with GC, and the ancestor LSN is guaranteed to remain
+        // valid while we are creating the branch.
        let _gc_cs = self.gc_cs.lock().await;

        // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
@@ -2834,13 +3037,6 @@ impl Tenant {
            lsn
        });

-        // Create a placeholder for the new branch. This will error
-        // out if the new timeline ID is already in use.
-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(dst_id, &timelines)?
-        };
-
        // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
        // horizon on the source timeline
        //
@@ -2932,21 +3128,38 @@ impl Tenant {
        Ok(new_timeline)
    }

-    /// - run initdb to init temporary instance and get bootstrap data
-    /// - after initialization completes, tar up the temp dir and upload it to S3.
-    ///
-    /// The caller is responsible for activating the returned timeline.
-    pub(crate) async fn bootstrap_timeline(
+    /// For unit tests, make this visible so that other modules can directly create timelines
+    #[cfg(test)]
+    pub(crate) async fn bootstrap_timeline_test(
        &self,
        timeline_id: TimelineId,
        pg_version: u32,
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let timeline_uninit_mark = {
-            let timelines = self.timelines.lock().unwrap();
-            self.create_timeline_uninit_mark(timeline_id, &timelines)?
-        };
+        let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
+        self.bootstrap_timeline(
+            timeline_id,
+            pg_version,
+            load_existing_initdb,
+            uninit_mark,
+            ctx,
+        )
+        .await
+    }
+
+    /// - run initdb to init temporary instance and get bootstrap data
+    /// - after initialization completes, tar up the temp dir and upload it to S3.
+    ///
+    /// The caller is responsible for activating the returned timeline.
+    async fn bootstrap_timeline(
+        &self,
+        timeline_id: TimelineId,
+        pg_version: u32,
+        load_existing_initdb: Option<TimelineId>,
+        timeline_uninit_mark: TimelineUninitMark<'_>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
        // temporary directory for basebackup files for the given timeline.

@@ -2980,6 +3193,7 @@ impl Tenant {
                    storage,
                    &self.tenant_shard_id,
                    &existing_initdb_timeline_id,
+                    &self.cancel,
                )
                .await
                .context("download initdb tar")?;
@@ -3020,6 +3234,7 @@ impl Tenant {
                            &timeline_id,
                            pgdata_zstd.try_clone().await?,
                            tar_zst_size,
+                            &self.cancel,
                        )
                        .await
                    },
@@ -3027,8 +3242,7 @@ impl Tenant {
                    3,
                    u32::MAX,
                    "persist_initdb_tar_zst",
-                    // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-                    backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+                    backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
                )
                .await?;

@@ -3143,11 +3357,11 @@ impl Tenant {
    /// at 'disk_consistent_lsn'. After any initial data has been imported, call
    /// `finish_creation` to insert the Timeline into the timelines map and to remove the
    /// uninit mark file.
-    async fn prepare_new_timeline(
-        &self,
+    async fn prepare_new_timeline<'a>(
+        &'a self,
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
-        uninit_mark: TimelineUninitMark,
+        uninit_mark: TimelineUninitMark<'a>,
        start_lsn: Lsn,
        ancestor: Option<Arc<Timeline>>,
    ) -> anyhow::Result<UninitializedTimeline> {
@@ -3220,23 +3434,38 @@ impl Tenant {
    fn create_timeline_uninit_mark(
        &self,
        timeline_id: TimelineId,
-        timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
-    ) -> anyhow::Result<TimelineUninitMark> {
+    ) -> Result<TimelineUninitMark, TimelineExclusionError> {
        let tenant_shard_id = self.tenant_shard_id;

-        anyhow::ensure!(
-            timelines.get(&timeline_id).is_none(),
-            "Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory"
-        );
-        let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
-        anyhow::ensure!(
-            !timeline_path.exists(),
-            "Timeline {timeline_path} already exists, cannot create its uninit mark file",
-        );
-
        let uninit_mark_path = self
            .conf
            .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
+        let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
+
+        let uninit_mark = TimelineUninitMark::new(
+            self,
+            timeline_id,
+            uninit_mark_path.clone(),
+            timeline_path.clone(),
+        )?;
+
+        // At this stage, we have got exclusive access to in-memory state for this timeline ID
+        // for creation.
+        // A timeline directory should never exist on disk already:
+        // - a previous failed creation would have cleaned up after itself
+        // - a pageserver restart would clean up timeline directories that don't have valid remote state
+        //
+        // Therefore it is an unexpected internal error to encounter a timeline directory already existing here,
+        // this error may indicate a bug in cleanup on failed creations.
+        if timeline_path.exists() {
+            return Err(TimelineExclusionError::Other(anyhow::anyhow!(
+                "Timeline directory already exists! This is a bug."
+            )));
+        }
+
+        // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
+        // that during process runtime, colliding creations will be caught in-memory without getting
+        // as far as failing to write a file.
        fs::OpenOptions::new()
            .write(true)
            .create_new(true)
@@ -3250,8 +3479,6 @@ impl Tenant {
                format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
            })?;

-        let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
-
        Ok(uninit_mark)
    }

@@ -3694,6 +3921,7 @@ pub(crate) mod harness {
                    tenant_conf.evictions_low_residence_duration_metric_threshold,
                ),
                gc_feedback: Some(tenant_conf.gc_feedback),
+                heatmap_period: Some(tenant_conf.heatmap_period),
            }
        }
    }
@@ -4000,13 +4228,7 @@ mod tests {
            .await
        {
            Ok(_) => panic!("duplicate timeline creation should fail"),
-            Err(e) => assert_eq!(
-                e.to_string(),
-                format!(
-                    "Timeline {}/{} already exists in pageserver's memory",
-                    tenant.tenant_shard_id, TIMELINE_ID
-                )
-            ),
+            Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
        }

        Ok(())
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -334,6 +334,11 @@ pub struct TenantConf {
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,
    pub gc_feedback: bool,
+
+    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
+    /// may be disabled if a Tenant will not have secondary locations: only secondary
+    /// locations will use the heatmap uploaded by attached locations.
+    pub heatmap_period: Duration,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -414,6 +419,11 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub gc_feedback: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    #[serde(default)]
+    pub heatmap_period: Option<Duration>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -482,6 +492,7 @@ impl TenantConfOpt {
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
+            heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
        }
    }
 }
@@ -519,6 +530,7 @@ impl Default for TenantConf {
            )
            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
            gc_feedback: false,
+            heatmap_period: Duration::ZERO,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -71,6 +71,7 @@ async fn create_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;

@@ -87,8 +88,7 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
    .await
    .context("mark_upload")?;
@@ -170,6 +170,7 @@ async fn remove_tenant_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
    tenant_shard_id: &TenantShardId,
+    cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
    if let Some(remote_storage) = remote_storage {
        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
@@ -179,8 +180,7 @@ async fn remove_tenant_remote_delete_mark(
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
-            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
        .await
        .context("remove_tenant_remote_delete_mark")?;
@@ -322,9 +322,15 @@ impl DeleteTenantFlow {
        // Though sounds scary, different mark name?
        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
-                .await
-                .context("remote_mark")?
+            create_remote_delete_mark(
+                conf,
+                remote_storage,
+                &tenant.tenant_shard_id,
+                // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
+                &CancellationToken::new(),
+            )
+            .await
+            .context("remote_mark")?
        }

        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -524,8 +530,14 @@ impl DeleteTenantFlow {
                .context("timelines dir not empty")?;
        }

-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
-            .await?;
+        remove_tenant_remote_delete_mark(
+            conf,
+            remote_storage.as_ref(),
+            &tenant.tenant_shard_id,
+            // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
+            &CancellationToken::new(),
+        )
+        .await?;

        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
            Err(anyhow::anyhow!(
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -28,7 +28,7 @@ use crate::control_plane_client::{
    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
-use crate::metrics::TENANT_MANAGER as METRICS;
+use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
@@ -44,7 +44,6 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
-use super::timeline::delete::DeleteTimelineFlow;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -430,6 +429,13 @@ pub async fn init_tenant_mgr(
    let tenant_generations =
        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

+    tracing::info!(
+        "Attaching {} tenants at startup, warming up {} at a time",
+        tenant_configs.len(),
+        conf.concurrent_tenant_warmup.initial_permits()
+    );
+    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
+
    // Construct `Tenant` objects and start them running
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -807,6 +813,12 @@ pub(crate) async fn set_new_tenant_config(
 }

 impl TenantManager {
+    /// Convenience function so that anyone with a TenantManager can get at the global configuration, without
+    /// having to pass it around everywhere as a separate object.
+    pub(crate) fn get_conf(&self) -> &'static PageServerConf {
+        self.conf
+    }
+
    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
    /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
    pub(crate) fn get_attached_tenant_shard(
@@ -842,17 +854,6 @@ impl TenantManager {
        }
    }

-    pub(crate) async fn delete_timeline(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        _ctx: &RequestContext,
-    ) -> Result<(), DeleteTimelineError> {
-        let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
-        DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
-        Ok(())
-    }
-
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
@@ -1087,6 +1088,20 @@ impl TenantManager {

        Ok(())
    }
+
+    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
+        let locked = self.tenants.read().unwrap();
+        match &*locked {
+            TenantsMap::Initializing => Vec::new(),
+            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map
+                .values()
+                .filter_map(|slot| {
+                    slot.get_attached()
+                        .and_then(|t| if t.is_active() { Some(t.clone()) } else { None })
+                })
+                .collect(),
+        }
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1201,7 +1216,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
                        // Fast path: we don't need to do any async waiting.
                        return Ok(tenant.clone());
                    }
-                    _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
+                    _ => {
+                        tenant.activate_now();
+                        (WaitFor::Tenant(tenant.clone()), tenant_shard_id)
+                    }
                }
            }
            Some(TenantSlot::Secondary) => {
@@ -1255,28 +1273,10 @@ pub(crate) async fn get_active_tenant_with_timeout(
    };

    tracing::debug!("Waiting for tenant to enter active state...");
-    match timeout_cancellable(
-        deadline.duration_since(Instant::now()),
-        cancel,
-        tenant.wait_to_become_active(),
-    )
-    .await
-    {
-        Ok(Ok(())) => Ok(tenant),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => {
-            let latest_state = tenant.current_state();
-            if latest_state == TenantState::Active {
-                Ok(tenant)
-            } else {
-                Err(GetActiveTenantError::WaitForActiveTimeout {
-                    latest_state: Some(latest_state),
-                    wait_time: timeout,
-                })
-            }
-        }
-        Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled),
-    }
+    tenant
+        .wait_to_become_active(deadline.duration_since(Instant::now()))
+        .await?;
+    Ok(tenant)
 }

 pub(crate) async fn delete_tenant(
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -180,7 +180,7 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

-mod download;
+pub(crate) mod download;
 pub mod index;
 mod upload;

@@ -196,10 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
+use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
+use std::time::Duration;

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
@@ -316,6 +318,47 @@ pub struct RemoteTimelineClient {
    storage_impl: GenericRemoteStorage,

    deletion_queue_client: DeletionQueueClient,
+
+    cancel: CancellationToken,
+}
+
+/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
+/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
+const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+
+/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
+///
+/// This is a convenience for the various upload functions.  In future
+/// the anyhow::Error result should be replaced with a more structured type that
+/// enables callers to avoid handling shutdown as an error.
+async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
+where
+    F: std::future::Future<Output = anyhow::Result<()>>,
+{
+    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
+        Ok(Ok(())) => Ok(()),
+        Ok(Err(e)) => Err(e),
+        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
+        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
+    }
+}
+/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
+async fn download_cancellable<F, R>(
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<R, DownloadError>
+where
+    F: std::future::Future<Output = Result<R, DownloadError>>,
+{
+    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
+        Ok(Ok(r)) => Ok(r),
+        Ok(Err(e)) => Err(e),
+        Err(TimeoutCancellableError::Timeout) => {
+            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
+        }
+        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
+    }
 }

 impl RemoteTimelineClient {
@@ -351,6 +394,7 @@ impl RemoteTimelineClient {
                &tenant_shard_id,
                &timeline_id,
            )),
+            cancel: CancellationToken::new(),
        }
    }

@@ -501,6 +545,7 @@ impl RemoteTimelineClient {
        &self,
        layer_file_name: &LayerFileName,
        layer_metadata: &LayerFileMetadata,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<u64> {
        let downloaded_size = {
            let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -517,6 +562,7 @@ impl RemoteTimelineClient {
                self.timeline_id,
                layer_file_name,
                layer_metadata,
+                cancel,
            )
            .measure_remote_op(
                self.tenant_shard_id.tenant_id,
@@ -971,6 +1017,7 @@ impl RemoteTimelineClient {
                    &self.timeline_id,
                    self.generation,
                    &index_part_with_deleted_at,
+                    &self.cancel,
                )
            },
            |_e| false,
@@ -980,8 +1027,7 @@ impl RemoteTimelineClient {
            // when executed as part of tenant deletion this happens in the background
            2,
            "persist_index_part_with_deleted_flag",
-            // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
        .await?;

@@ -1281,6 +1327,7 @@ impl RemoteTimelineClient {
                        path,
                        layer_metadata,
                        self.generation,
+                        &self.cancel,
                    )
                    .measure_remote_op(
                        self.tenant_shard_id.tenant_id,
@@ -1307,6 +1354,7 @@ impl RemoteTimelineClient {
                        &self.timeline_id,
                        self.generation,
                        index_part,
+                        &self.cancel,
                    )
                    .measure_remote_op(
                        self.tenant_shard_id.tenant_id,
@@ -1604,6 +1652,23 @@ impl RemoteTimelineClient {
            }
        }
    }
+
+    pub(crate) fn get_layers_metadata(
+        &self,
+        layers: Vec<LayerFileName>,
+    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
+        let q = self.upload_queue.lock().unwrap();
+        let q = match &*q {
+            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", q.as_str())
+            }
+            UploadQueue::Initialized(inner) => inner,
+        };
+
+        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
+
+        Ok(decorated.collect())
+    }
 }

 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1659,6 +1724,13 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

+pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
+pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
+    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
+        .expect("Failed to construct path")
+}
+
 /// Given the key of an index, parse out the generation part of the name
 pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
@@ -1804,6 +1876,7 @@ mod tests {
                    &self.harness.tenant_shard_id,
                    &TIMELINE_ID,
                )),
+                cancel: CancellationToken::new(),
            })
        }

--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -5,7 +5,6 @@

 use std::collections::HashSet;
 use std::future::Future;
-use std::time::Duration;

 use anyhow::{anyhow, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -14,13 +13,17 @@ use tokio::fs::{self, File, OpenOptions};
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
+use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
-use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
+use crate::tenant::remote_timeline_client::{
+    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
+};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
+use crate::virtual_file::on_fatal_io_error;
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
@@ -32,8 +35,6 @@ use super::{
    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };

-static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
-
 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
 /// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -46,6 +47,7 @@ pub async fn download_layer_file<'a>(
    timeline_id: TimelineId,
    layer_file_name: &'a LayerFileName,
    layer_metadata: &'a LayerFileMetadata,
+    cancel: &CancellationToken,
 ) -> Result<u64, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -73,14 +75,18 @@ pub async fn download_layer_file<'a>(
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);

+    let cancel_inner = cancel.clone();
    let (mut destination_file, bytes_amount) = download_retry(
        || async {
            let destination_file = tokio::fs::File::create(&temp_file_path)
                .await
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;
-            let download = storage
-                .download(&remote_path)
+
+            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
+            // file: the write to local file doesn't start until after the request header is returned
+            // and we start draining the body stream below
+            let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
                .await
                .with_context(|| {
                    format!(
@@ -94,12 +100,33 @@ pub async fn download_layer_file<'a>(

            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);

-            let bytes_amount = tokio::time::timeout(
-                MAX_DOWNLOAD_DURATION,
+            // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
+            // and we will unlink the temporary file if there is an error.  This unlink is important because we
+            // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
+            // we will imminiently try and write to again.
+            let bytes_amount: u64 = match timeout_cancellable(
+                DOWNLOAD_TIMEOUT,
+                &cancel_inner,
                tokio::io::copy_buf(&mut reader, &mut destination_file),
            )
            .await
-            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+            .with_context(|| {
+                format!(
+                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
+                )
+            })
+            .map_err(DownloadError::Other)?
+            {
+                Ok(b) => Ok(b),
+                Err(e) => {
+                    // Remove incomplete files: on restart Timeline would do this anyway, but we must
+                    // do it here for the retry case.
+                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
+                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
+                    }
+                    Err(e)
+                }
+            }
            .with_context(|| {
                format!(
                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
@@ -112,6 +139,7 @@ pub async fn download_layer_file<'a>(
            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
+        cancel,
    )
    .await?;

@@ -188,8 +216,14 @@ pub async fn list_remote_timelines(
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

+    let cancel_inner = cancel.clone();
    let listing = download_retry_forever(
-        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+        || {
+            download_cancellable(
+                &cancel_inner,
+                storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+            )
+        },
        &format!("list timelines for {tenant_shard_id}"),
        cancel,
    )
@@ -230,9 +264,13 @@ async fn do_download_index_part(

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

+    let cancel_inner = cancel.clone();
    let index_part_bytes = download_retry_forever(
        || async {
-            let index_part_download = storage.download(&remote_path).await?;
+            // Cancellation: if is safe to cancel this future because we're just downloading into
+            // a memory buffer, not touching local disk.
+            let index_part_download =
+                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;

            let mut index_part_bytes = Vec::new();
            let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -347,10 +385,7 @@ pub(super) async fn download_index_part(
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "listing index_part files",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
-            unreachable!()
-        }),
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
    .await
    .map_err(DownloadError::Other)?;
@@ -389,6 +424,7 @@ pub(crate) async fn download_initdb_tar_zst(
    storage: &GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
+    cancel: &CancellationToken,
 ) -> Result<(Utf8PathBuf, File), DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -406,6 +442,8 @@ pub(crate) async fn download_initdb_tar_zst(
        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

+    let cancel_inner = cancel.clone();
+
    let file = download_retry(
        || async {
            let file = OpenOptions::new()
@@ -418,10 +456,14 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = storage.download(&remote_path).await?;
+            let download =
+                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

+            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
+            // not without thinking carefully about how to recover safely from cancelling a write to
+            // local storage (e.g. by writing into a temp file as we do in download_layer)
            tokio::io::copy_buf(&mut download, &mut writer)
                .await
                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
@@ -437,6 +479,7 @@ pub(crate) async fn download_initdb_tar_zst(
            Ok(file)
        },
        &format!("download {remote_path}"),
+        cancel,
    )
    .await
    .map_err(|e| {
@@ -460,7 +503,11 @@ pub(crate) async fn download_initdb_tar_zst(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(
+    op: O,
+    description: &str,
+    cancel: &CancellationToken,
+) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
@@ -471,10 +518,7 @@ where
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
-            unreachable!()
-        }),
+        backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
    )
    .await
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -4,14 +4,17 @@ use anyhow::{bail, Context};
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
-use std::io::ErrorKind;
+use std::io::{ErrorKind, SeekFrom};
 use tokio::fs::{self, File};
+use tokio::io::AsyncSeekExt;
+use tokio_util::sync::CancellationToken;

 use super::Generation;
 use crate::{
    config::PageServerConf,
    tenant::remote_timeline_client::{
        index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
+        upload_cancellable,
    },
 };
 use remote_storage::GenericRemoteStorage;
@@ -28,6 +31,7 @@ pub(super) async fn upload_index_part<'a>(
    timeline_id: &TimelineId,
    generation: Generation,
    index_part: &'a IndexPart,
+    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");

@@ -43,14 +47,16 @@ pub(super) async fn upload_index_part<'a>(
    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
-    storage
-        .upload_storage_object(
+    upload_cancellable(
+        cancel,
+        storage.upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
-        )
-        .await
-        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
+        ),
+    )
+    .await
+    .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }

 /// Attempts to upload given layer files.
@@ -63,6 +69,7 @@ pub(super) async fn upload_timeline_layer<'a>(
    source_path: &'a Utf8Path,
    known_metadata: &'a LayerFileMetadata,
    generation: Generation,
+    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    fail_point!("before-upload-layer", |_| {
        bail!("failpoint before-upload-layer")
@@ -106,8 +113,7 @@ pub(super) async fn upload_timeline_layer<'a>(

    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);

-    storage
-        .upload(reader, fs_size, &storage_path, None)
+    upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
        .await
        .with_context(|| format!("upload layer from local path '{source_path}'"))?;

@@ -119,16 +125,22 @@ pub(crate) async fn upload_initdb_dir(
    storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-    initdb_tar_zst: File,
+    mut initdb_tar_zst: File,
    size: u64,
+    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading initdb dir");

+    // We might have read somewhat into the file already in the prior retry attempt
+    initdb_tar_zst.seek(SeekFrom::Start(0)).await?;
+
    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);

    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
-    storage
-        .upload_storage_object(file, size as usize, &remote_path)
-        .await
-        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+    upload_cancellable(
+        cancel,
+        storage.upload_storage_object(file, size as usize, &remote_path),
+    )
+    .await
+    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -0,0 +1,104 @@
+pub mod heatmap;
+mod heatmap_uploader;
+
+use std::sync::Arc;
+
+use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+
+use self::heatmap_uploader::heatmap_uploader_task;
+
+use super::mgr::TenantManager;
+
+use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
+
+use tokio_util::sync::CancellationToken;
+use utils::completion::Barrier;
+
+enum UploadCommand {
+    Upload(TenantShardId),
+}
+
+struct CommandRequest<T> {
+    payload: T,
+    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+}
+
+struct CommandResponse {
+    result: anyhow::Result<()>,
+}
+
+/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
+/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
+/// where we want to immediately upload/download for a particular tenant.  In normal operation
+/// uploads & downloads are autonomous and not driven by this interface.
+pub struct SecondaryController {
+    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+}
+
+impl SecondaryController {
+    async fn dispatch<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
+        payload: T,
+    ) -> anyhow::Result<()> {
+        let (response_tx, response_rx) = tokio::sync::oneshot::channel();
+
+        queue
+            .send(CommandRequest {
+                payload,
+                response_tx,
+            })
+            .await
+            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
+
+        let response = response_rx
+            .await
+            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
+
+        response.result
+    }
+
+    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
+            .await
+    }
+}
+
+pub fn spawn_tasks(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> SecondaryController {
+    let (upload_req_tx, upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryUploads,
+        None,
+        None,
+        "heatmap uploads",
+        false,
+        async move {
+            heatmap_uploader_task(
+                tenant_manager,
+                remote_storage,
+                upload_req_rx,
+                background_jobs_can_start,
+                cancel,
+            )
+            .await
+        },
+    );
+
+    SecondaryController { upload_req_tx }
+}
+
+/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
+pub fn null_controller() -> SecondaryController {
+    let (upload_req_tx, _upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+    SecondaryController { upload_req_tx }
+}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -0,0 +1,64 @@
+use std::time::SystemTime;
+
+use crate::tenant::{
+    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
+};
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
+
+use utils::{generation::Generation, id::TimelineId};
+
+#[derive(Serialize, Deserialize)]
+pub(super) struct HeatMapTenant {
+    /// Generation of the attached location that uploaded the heatmap: this is not required
+    /// for correctness, but acts as a hint to secondary locations in order to detect thrashing
+    /// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
+    pub(super) generation: Generation,
+
+    pub(super) timelines: Vec<HeatMapTimeline>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub(crate) struct HeatMapTimeline {
+    #[serde_as(as = "DisplayFromStr")]
+    pub(super) timeline_id: TimelineId,
+
+    pub(super) layers: Vec<HeatMapLayer>,
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub(crate) struct HeatMapLayer {
+    pub(super) name: LayerFileName,
+    pub(super) metadata: IndexLayerMetadata,
+
+    #[serde_as(as = "TimestampSeconds<i64>")]
+    pub(super) access_time: SystemTime,
+    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
+    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
+}
+
+impl HeatMapLayer {
+    pub(crate) fn new(
+        name: LayerFileName,
+        metadata: IndexLayerMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            name,
+            metadata,
+            access_time,
+        }
+    }
+}
+
+impl HeatMapTimeline {
+    pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
+        Self {
+            timeline_id,
+            layers,
+        }
+    }
+}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -0,0 +1,582 @@
+use std::{
+    collections::HashMap,
+    sync::{Arc, Weak},
+    time::{Duration, Instant},
+};
+
+use crate::{
+    metrics::SECONDARY_MODE,
+    tenant::{
+        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
+        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
+    },
+};
+
+use md5;
+use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
+
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use tracing::instrument;
+use utils::{backoff, completion::Barrier};
+
+use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
+
+/// Period between heatmap uploader walking Tenants to look for work to do.
+/// If any tenants have a heatmap upload period lower than this, it will be adjusted
+/// downward to match.
+const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
+
+struct WriteInProgress {
+    barrier: Barrier,
+}
+
+struct UploadPending {
+    tenant: Arc<Tenant>,
+    last_digest: Option<md5::Digest>,
+}
+
+struct WriteComplete {
+    tenant_shard_id: TenantShardId,
+    completed_at: Instant,
+    digest: Option<md5::Digest>,
+    next_upload: Option<Instant>,
+}
+
+/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
+/// when we last did a write.  We only populate this after doing at least one
+/// write for a tenant -- this avoids holding state for tenants that have
+/// uploads disabled.
+
+struct UploaderTenantState {
+    // This Weak only exists to enable culling idle instances of this type
+    // when the Tenant has been deallocated.
+    tenant: Weak<Tenant>,
+
+    /// Digest of the serialized heatmap that we last successfully uploaded
+    ///
+    /// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
+    /// which is also an md5sum.
+    last_digest: Option<md5::Digest>,
+
+    /// When the last upload attempt completed (may have been successful or failed)
+    last_upload: Option<Instant>,
+
+    /// When should we next do an upload?  None means never.
+    next_upload: Option<Instant>,
+}
+
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+
+    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
+    /// limits permit it.
+    tenants_pending: std::collections::VecDeque<UploadPending>,
+
+    /// Tenants for which a task in `tasks` has been spawned.
+    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
+
+    tasks: JoinSet<()>,
+
+    /// Channel for our child tasks to send results to: we use a channel for results rather than
+    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
+    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
+    /// behavior.
+    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
+    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
+
+    concurrent_uploads: usize,
+
+    scheduling_interval: Duration,
+}
+
+/// The uploader task runs a loop that periodically wakes up and schedules tasks for
+/// tenants that require an upload, or handles any commands that have been sent into
+/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
+/// spawn.
+///
+/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
+/// all tenants that require an upload, and in between scheduling iterations we will
+/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
+///
+/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
+/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
+/// we might block waiting on a Tenant.
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+    let mut uploader = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tasks: JoinSet::new(),
+        tenants: HashMap::new(),
+        tenants_pending: std::collections::VecDeque::new(),
+        tenants_uploading: HashMap::new(),
+        task_result_tx: result_tx,
+        task_result_rx: result_rx,
+        concurrent_uploads,
+        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
+    };
+
+    tracing::info!("Waiting for background_jobs_can start...");
+    background_jobs_can_start.wait().await;
+    tracing::info!("background_jobs_can is ready, proceeding.");
+
+    while !cancel.is_cancelled() {
+        // Look for new work: this is relatively expensive because we have to go acquire the lock on
+        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+        // require an upload.
+        uploader.schedule_iteration().await?;
+
+        // Between scheduling iterations, we will:
+        //  - Drain any complete tasks and spawn pending tasks
+        //  - Handle incoming administrative commands
+        //  - Check our cancellation token
+        let next_scheduling_iteration = Instant::now()
+            .checked_add(uploader.scheduling_interval)
+            .unwrap_or_else(|| {
+                tracing::warn!(
+                    "Scheduling interval invalid ({}s), running immediately!",
+                    uploader.scheduling_interval.as_secs_f64()
+                );
+                Instant::now()
+            });
+        loop {
+            tokio::select! {
+                _ = cancel.cancelled() => {
+                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                    tracing::info!("Heatmap uploader joining tasks");
+                    while let Some(_r) = uploader.tasks.join_next().await {};
+                    tracing::info!("Heatmap uploader terminating");
+
+                    break;
+                },
+                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
+                    break;},
+                cmd = command_queue.recv() => {
+                    tracing::debug!("heatmap_uploader_task: woke for command queue");
+                    let cmd = match cmd {
+                        Some(c) =>c,
+                        None => {
+                            // SecondaryController was destroyed, and this has raced with
+                            // our CancellationToken
+                            tracing::info!("Heatmap uploader terminating");
+                            cancel.cancel();
+                            break;
+                        }
+                    };
+
+                    let CommandRequest{
+                        response_tx,
+                        payload
+                    } = cmd;
+                    uploader.handle_command(payload, response_tx);
+                },
+                _ = uploader.process_next_completion() => {
+                    if !cancel.is_cancelled() {
+                        uploader.spawn_pending();
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+impl HeatmapUploader {
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
+        // Cull any entries in self.tenants whose Arc<Tenant> is gone
+        self.tenants
+            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
+
+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.tenants_pending.clear();
+
+        // Used a fixed 'now' through the following loop, for efficiency and fairness.
+        let now = Instant::now();
+
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        const YIELD_ITERATIONS: usize = 1000;
+
+        // Iterate over tenants looking for work to do.
+        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
+        for (i, tenant) in tenants.into_iter().enumerate() {
+            // Process is shutting down, drop out
+            if self.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            // Skip tenants that already have a write in flight
+            if self
+                .tenants_uploading
+                .contains_key(tenant.get_tenant_shard_id())
+            {
+                continue;
+            }
+
+            self.maybe_schedule_upload(&now, tenant);
+
+            if i + 1 % YIELD_ITERATIONS == 0 {
+                tokio::task::yield_now().await;
+            }
+        }
+
+        // Spawn tasks for as many of our pending tenants as we can.
+        self.spawn_pending();
+
+        Ok(())
+    }
+
+    ///
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) {
+        match self.task_result_rx.recv().await {
+            Some(r) => {
+                self.on_completion(r);
+            }
+            None => {
+                unreachable!("Result sender is stored on Self");
+            }
+        }
+    }
+
+    /// The 'maybe' refers to the tenant's state: whether it is configured
+    /// for heatmap uploads at all, and whether sufficient time has passed
+    /// since the last upload.
+    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
+        match tenant.get_heatmap_period() {
+            None => {
+                // Heatmaps are disabled for this tenant
+                return;
+            }
+            Some(period) => {
+                // If any tenant has asked for uploads more frequent than our scheduling interval,
+                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                // we may set rather short intervals.
+                if period < self.scheduling_interval {
+                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
+                }
+            }
+        }
+
+        // Stale attachments do not upload anything: if we are in this state, there is probably some
+        // other attachment in mode Single or Multi running on another pageserver, and we don't
+        // want to thrash and overwrite their heatmap uploads.
+        if tenant.get_attach_mode() == AttachmentMode::Stale {
+            return;
+        }
+
+        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
+        // with the completion time in on_completion.
+        let state = self
+            .tenants
+            .entry(*tenant.get_tenant_shard_id())
+            .or_insert_with(|| UploaderTenantState {
+                tenant: Arc::downgrade(&tenant),
+                last_upload: None,
+                next_upload: Some(Instant::now()),
+                last_digest: None,
+            });
+
+        // Decline to do the upload if insufficient time has passed
+        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
+            return;
+        }
+
+        let last_digest = state.last_digest;
+        self.tenants_pending.push_back(UploadPending {
+            tenant,
+            last_digest,
+        })
+    }
+
+    fn spawn_pending(&mut self) {
+        while !self.tenants_pending.is_empty()
+            && self.tenants_uploading.len() < self.concurrent_uploads
+        {
+            // unwrap: loop condition includes !is_empty()
+            let pending = self.tenants_pending.pop_front().unwrap();
+            self.spawn_upload(pending.tenant, pending.last_digest);
+        }
+    }
+
+    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
+        let remote_storage = self.remote_storage.clone();
+        let tenant_shard_id = *tenant.get_tenant_shard_id();
+        let (completion, barrier) = utils::completion::channel();
+        let result_tx = self.task_result_tx.clone();
+        self.tasks.spawn(async move {
+            // Guard for the barrier in [`WriteInProgress`]
+            let _completion = completion;
+
+            let started_at = Instant::now();
+            let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
+                Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap.inc();
+                    Some(digest)
+                }
+                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
+                Err(UploadHeatmapError::Upload(e)) => {
+                    tracing::warn!(
+                        "Failed to upload heatmap for tenant {}: {e:#}",
+                        tenant.get_tenant_shard_id(),
+                    );
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap_errors.inc();
+                    last_digest
+                }
+                Err(UploadHeatmapError::Cancelled) => {
+                    tracing::info!("Cancelled heatmap upload, shutting down");
+                    last_digest
+                }
+            };
+
+            let now = Instant::now();
+            let next_upload = tenant
+                .get_heatmap_period()
+                .and_then(|period| now.checked_add(period));
+
+            result_tx
+                .send(WriteComplete {
+                    tenant_shard_id: *tenant.get_tenant_shard_id(),
+                    completed_at: now,
+                    digest,
+                    next_upload,
+                })
+                .ok();
+        });
+
+        self.tenants_uploading
+            .insert(tenant_shard_id, WriteInProgress { barrier });
+    }
+
+    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
+    fn on_completion(&mut self, completion: WriteComplete) {
+        tracing::debug!("Heatmap upload completed");
+        let WriteComplete {
+            tenant_shard_id,
+            completed_at,
+            digest,
+            next_upload,
+        } = completion;
+        self.tenants_uploading.remove(&tenant_shard_id);
+        use std::collections::hash_map::Entry;
+        match self.tenants.entry(tenant_shard_id) {
+            Entry::Vacant(_) => {
+                // Tenant state was dropped, nothing to update.
+            }
+            Entry::Occupied(mut entry) => {
+                entry.get_mut().last_upload = Some(completed_at);
+                entry.get_mut().last_digest = digest;
+                entry.get_mut().next_upload = next_upload
+            }
+        }
+    }
+
+    fn handle_command(
+        &mut self,
+        command: UploadCommand,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        match command {
+            UploadCommand::Upload(tenant_shard_id) => {
+                // If an upload was ongoing for this tenant, let it finish first.
+                let barrier = if let Some(writing_state) =
+                    self.tenants_uploading.get(&tenant_shard_id)
+                {
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap write to complete");
+                    writing_state.barrier.clone()
+                } else {
+                    // Spawn the upload then immediately wait for it.  This will block processing of other commands and
+                    // starting of other background work.
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Starting heatmap write on command");
+                    let tenant = match self
+                        .tenant_manager
+                        .get_attached_tenant_shard(tenant_shard_id, true)
+                    {
+                        Ok(t) => t,
+                        Err(e) => {
+                            // Drop result of send: we don't care if caller dropped their receiver
+                            drop(response_tx.send(CommandResponse {
+                                result: Err(e.into()),
+                            }));
+                            return;
+                        }
+                    };
+                    self.spawn_upload(tenant, None);
+                    let writing_state = self
+                        .tenants_uploading
+                        .get(&tenant_shard_id)
+                        .expect("We just inserted this");
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Waiting for heatmap upload to complete");
+
+                    writing_state.barrier.clone()
+                };
+
+                // This task does no I/O: it only listens for a barrier's completion and then
+                // sends to the command response channel.  It is therefore safe to spawn this without
+                // any gates/task_mgr hooks.
+                tokio::task::spawn(async move {
+                    barrier.wait().await;
+
+                    tracing::info!(
+                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                        "Heatmap upload complete");
+
+                    // Drop result of send: we don't care if caller dropped their receiver
+                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
+                });
+            }
+        }
+    }
+}
+
+enum UploadHeatmapOutcome {
+    /// We successfully wrote to remote storage, with this digest.
+    Uploaded(md5::Digest),
+    /// We did not upload because the heatmap digest was unchanged since the last upload
+    NoChange,
+    /// We skipped the upload for some reason, such as tenant/timeline not ready
+    Skipped,
+}
+
+#[derive(thiserror::Error, Debug)]
+enum UploadHeatmapError {
+    #[error("Cancelled")]
+    Cancelled,
+
+    #[error(transparent)]
+    Upload(#[from] anyhow::Error),
+}
+
+/// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
+/// of the object we would have uploaded.
+#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
+async fn upload_tenant_heatmap(
+    remote_storage: GenericRemoteStorage,
+    tenant: &Arc<Tenant>,
+    last_digest: Option<md5::Digest>,
+) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
+    debug_assert_current_span_has_tenant_id();
+
+    let generation = tenant.get_generation();
+    if generation.is_none() {
+        // We do not expect this: generations were implemented before heatmap uploads.  However,
+        // handle it so that we don't have to make the generation in the heatmap an Option<>
+        // (Generation::none is not serializable)
+        tracing::warn!("Skipping heatmap upload for tenant with generation==None");
+        return Ok(UploadHeatmapOutcome::Skipped);
+    }
+
+    let mut heatmap = HeatMapTenant {
+        timelines: Vec::new(),
+        generation,
+    };
+    let timelines = tenant.timelines.lock().unwrap().clone();
+
+    let tenant_cancel = tenant.cancel.clone();
+
+    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
+    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
+    // in remote storage.
+    let _guard = match tenant.gate.enter() {
+        Ok(g) => g,
+        Err(_) => {
+            tracing::info!("Skipping heatmap upload for tenant which is shutting down");
+            return Err(UploadHeatmapError::Cancelled);
+        }
+    };
+
+    for (timeline_id, timeline) in timelines {
+        let heatmap_timeline = timeline.generate_heatmap().await;
+        match heatmap_timeline {
+            None => {
+                tracing::debug!(
+                    "Skipping heatmap upload because timeline {timeline_id} is not ready"
+                );
+                return Ok(UploadHeatmapOutcome::Skipped);
+            }
+            Some(heatmap_timeline) => {
+                heatmap.timelines.push(heatmap_timeline);
+            }
+        }
+    }
+
+    // Serialize the heatmap
+    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
+    let size = bytes.len();
+
+    // Drop out early if nothing changed since our last upload
+    let digest = md5::compute(&bytes);
+    if Some(digest) == last_digest {
+        return Ok(UploadHeatmapOutcome::NoChange);
+    }
+
+    let path = remote_heatmap_path(tenant.get_tenant_shard_id());
+
+    // Write the heatmap.
+    tracing::debug!("Uploading {size} byte heatmap to {path}");
+    if let Err(e) = backoff::retry(
+        || async {
+            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
+                bytes.clone(),
+            ))));
+            remote_storage
+                .upload_storage_object(bytes, size, &path)
+                .await
+        },
+        |_| false,
+        3,
+        u32::MAX,
+        "Uploading heatmap",
+        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
+    )
+    .await
+    {
+        if tenant_cancel.is_cancelled() {
+            return Err(UploadHeatmapError::Cancelled);
+        } else {
+            return Err(e.into());
+        }
+    }
+
+    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
+
+    Ok(UploadHeatmapOutcome::Uploaded(digest))
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -259,8 +259,9 @@ impl Layer {

        layer
            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
            .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
    }

    /// Download the layer if evicted.
@@ -457,6 +458,8 @@ struct LayerInner {
    /// For loaded layers, this may be some other value if the tenant has undergone
    /// a shard split since the layer was originally written.
    shard: ShardIndex,
+
+    last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
 }

 impl std::fmt::Display for LayerInner {
@@ -587,6 +590,7 @@ impl LayerInner {
            consecutive_failures: AtomicUsize::new(0),
            generation,
            shard,
+            last_evicted_at: std::sync::Mutex::default(),
        }
    }

@@ -651,7 +655,6 @@ impl LayerInner {
    }

    /// Cancellation safe.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
@@ -660,87 +663,101 @@ impl LayerInner {
        let mut init_permit = None;

        loop {
-            let download = move |permit| async move {
-                // disable any scheduled but not yet running eviction deletions for this
-                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+            let download = move |permit| {
+                async move {
+                    // disable any scheduled but not yet running eviction deletions for this
+                    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

-                // count cancellations, which currently remain largely unexpected
-                let init_cancelled =
-                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+                    // count cancellations, which currently remain largely unexpected
+                    let init_cancelled =
+                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

-                // no need to make the evict_and_wait wait for the actual download to complete
-                drop(self.status.send(Status::Downloaded));
+                    // no need to make the evict_and_wait wait for the actual download to complete
+                    drop(self.status.send(Status::Downloaded));

-                let timeline = self
-                    .timeline
-                    .upgrade()
-                    .ok_or_else(|| DownloadError::TimelineShutdown)?;
+                    let timeline = self
+                        .timeline
+                        .upgrade()
+                        .ok_or_else(|| DownloadError::TimelineShutdown)?;

-                // FIXME: grab a gate
+                    // FIXME: grab a gate

-                let can_ever_evict = timeline.remote_client.as_ref().is_some();
+                    let can_ever_evict = timeline.remote_client.as_ref().is_some();

-                // check if we really need to be downloaded; could have been already downloaded by a
-                // cancelled previous attempt.
-                let needs_download = self
-                    .needs_download()
-                    .await
-                    .map_err(DownloadError::PreStatFailed)?;
+                    // check if we really need to be downloaded; could have been already downloaded by a
+                    // cancelled previous attempt.
+                    let needs_download = self
+                        .needs_download()
+                        .await
+                        .map_err(DownloadError::PreStatFailed)?;

-                let permit = if let Some(reason) = needs_download {
-                    if let NeedsDownload::NotFile(ft) = reason {
-                        return Err(DownloadError::NotFile(ft));
+                    let permit = if let Some(reason) = needs_download {
+                        if let NeedsDownload::NotFile(ft) = reason {
+                            return Err(DownloadError::NotFile(ft));
+                        }
+
+                        // only reset this after we've decided we really need to download. otherwise it'd
+                        // be impossible to mark cancelled downloads for eviction, like one could imagine
+                        // we would like to do for prefetching which was not needed.
+                        self.wanted_evicted.store(false, Ordering::Release);
+
+                        if !can_ever_evict {
+                            return Err(DownloadError::NoRemoteStorage);
+                        }
+
+                        if let Some(ctx) = ctx {
+                            self.check_expected_download(ctx)?;
+                        }
+
+                        if !allow_download {
+                            // this does look weird, but for LayerInner the "downloading" means also changing
+                            // internal once related state ...
+                            return Err(DownloadError::DownloadRequired);
+                        }
+
+                        tracing::info!(%reason, "downloading on-demand");
+
+                        self.spawn_download_and_wait(timeline, permit).await?
+                    } else {
+                        // the file is present locally, probably by a previous but cancelled call to
+                        // get_or_maybe_download. alternatively we might be running without remote storage.
+                        LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+                        permit
+                    };
+
+                    let since_last_eviction =
+                        self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
+                    if let Some(since_last_eviction) = since_last_eviction {
+                        // FIXME: this will not always be recorded correctly until #6028 (the no
+                        // download needed branch above)
+                        LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
                    }

-                    // only reset this after we've decided we really need to download. otherwise it'd
-                    // be impossible to mark cancelled downloads for eviction, like one could imagine
-                    // we would like to do for prefetching which was not needed.
-                    self.wanted_evicted.store(false, Ordering::Release);
+                    let res = Arc::new(DownloadedLayer {
+                        owner: Arc::downgrade(self),
+                        kind: tokio::sync::OnceCell::default(),
+                        version: next_version,
+                    });

-                    if !can_ever_evict {
-                        return Err(DownloadError::NoRemoteStorage);
+                    self.access_stats.record_residence_event(
+                        LayerResidenceStatus::Resident,
+                        LayerResidenceEventReason::ResidenceChange,
+                    );
+
+                    let waiters = self.inner.initializer_count();
+                    if waiters > 0 {
+                        tracing::info!(
+                            waiters,
+                            "completing the on-demand download for other tasks"
+                        );
                    }

-                    if let Some(ctx) = ctx {
-                        self.check_expected_download(ctx)?;
-                    }
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);

-                    if !allow_download {
-                        // this does look weird, but for LayerInner the "downloading" means also changing
-                        // internal once related state ...
-                        return Err(DownloadError::DownloadRequired);
-                    }
-
-                    tracing::info!(%reason, "downloading on-demand");
-
-                    self.spawn_download_and_wait(timeline, permit).await?
-                } else {
-                    // the file is present locally, probably by a previous but cancelled call to
-                    // get_or_maybe_download. alternatively we might be running without remote storage.
-                    LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                    permit
-                };
-
-                let res = Arc::new(DownloadedLayer {
-                    owner: Arc::downgrade(self),
-                    kind: tokio::sync::OnceCell::default(),
-                    version: next_version,
-                });
-
-                self.access_stats.record_residence_event(
-                    LayerResidenceStatus::Resident,
-                    LayerResidenceEventReason::ResidenceChange,
-                );
-
-                let waiters = self.inner.initializer_count();
-                if waiters > 0 {
-                    tracing::info!(waiters, "completing the on-demand download for other tasks");
+                    Ok((ResidentOrWantedEvicted::Resident(res), permit))
                }
-
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                Ok((ResidentOrWantedEvicted::Resident(res), permit))
+                .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
            };

            if let Some(init_permit) = init_permit.take() {
@@ -851,6 +868,7 @@ impl LayerInner {
                let result = client.download_layer_file(
                    &this.desc.filename(),
                    &this.metadata(),
+                    &crate::task_mgr::shutdown_token()
                )
                .await;

@@ -1117,6 +1135,8 @@ impl LayerInner {
        // we are still holding the permit, so no new spawn_download_and_wait can happen
        drop(self.status.send(Status::Evicted));

+        *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
+
        res
    }

@@ -1421,6 +1441,7 @@ pub(crate) struct LayerImplMetrics {

    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
+    redownload_after: metrics::Histogram,
 }

 impl Default for LayerImplMetrics {
@@ -1496,6 +1517,26 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

+        let redownload_after = {
+            let minute = 60.0;
+            let hour = 60.0 * minute;
+            metrics::register_histogram!(
+                "pageserver_layer_redownloaded_after",
+                "Time between evicting and re-downloading.",
+                vec![
+                    10.0,
+                    30.0,
+                    minute,
+                    5.0 * minute,
+                    15.0 * minute,
+                    30.0 * minute,
+                    hour,
+                    12.0 * hour,
+                ]
+            )
+            .unwrap()
+        };
+
        Self {
            started_evictions,
            completed_evictions,
@@ -1507,6 +1548,7 @@ impl Default for LayerImplMetrics {

            rare_counters,
            inits_cancelled,
+            redownload_after,
        }
    }
 }
@@ -1574,6 +1616,10 @@ impl LayerImplMetrics {
    fn inc_init_cancelled(&self) {
        self.inits_cancelled.inc()
    }
+
+    fn record_redownloaded_after(&self, duration: std::time::Duration) {
+        self.redownload_after.observe(duration.as_secs_f64())
+    }
 }

 #[derive(enum_map::Enum)]
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -54,31 +54,18 @@ impl BackgroundLoopKind {
    }
 }

-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
+/// Cancellation safe.
+pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
-    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
+) -> impl Drop {
+    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
        .with_label_values(&[loop_kind.as_static_str()])
-        .inc();
-    scopeguard::defer!(
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
-    );
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
+        .guard();
+
+    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
+        Ok(permit) => permit,
+        Err(_closed) => unreachable!("we never close the semaphore"),
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -29,7 +29,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{id::TenantTimelineId, sync::gate::Gate};
+use utils::sync::gate::Gate;

 use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
@@ -51,7 +51,7 @@ use crate::tenant::storage_layer::{
    LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
    ValueReconstructState,
 };
-use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
+use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -98,8 +98,9 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
-use super::remote_timeline_client::index::IndexPart;
+use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
 use super::remote_timeline_client::RemoteTimelineClient;
+use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -377,9 +378,6 @@ pub enum PageReconstructError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),

-    /// The operation would require downloading a layer that is missing locally.
-    NeedsDownload(TenantTimelineId, LayerFileName),
-
    /// The operation was cancelled
    Cancelled,

@@ -408,14 +406,6 @@ impl std::fmt::Debug for PageReconstructError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            Self::Other(err) => err.fmt(f),
-            Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
-                write!(
-                    f,
-                    "layer {}/{} needs download",
-                    tenant_timeline_id,
-                    layer_file_name.file_name()
-                )
-            }
            Self::Cancelled => write!(f, "cancelled"),
            Self::AncestorStopping(timeline_id) => {
                write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -429,14 +419,6 @@ impl std::fmt::Display for PageReconstructError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            Self::Other(err) => err.fmt(f),
-            Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
-                write!(
-                    f,
-                    "layer {}/{} needs download",
-                    tenant_timeline_id,
-                    layer_file_name.file_name()
-                )
-            }
            Self::Cancelled => write!(f, "cancelled"),
            Self::AncestorStopping(timeline_id) => {
                write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -464,6 +446,12 @@ pub(crate) enum CompactFlags {
    ForceRepartition,
 }

+impl std::fmt::Debug for Timeline {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "Timeline<{}>", self.timeline_id)
+    }
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -727,19 +715,27 @@ impl Timeline {
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
-        let _g = self.compaction_lock.lock().await;
+        // most likely the cancellation token is from background task, but in tests it could be the
+        // request task as well.
+
+        let prepare = async move {
+            let guard = self.compaction_lock.lock().await;
+
+            let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
+                BackgroundLoopKind::Compaction,
+                ctx,
+            )
+            .await;
+
+            (guard, permit)
+        };

        // this wait probably never needs any "long time spent" logging, because we already nag if
        // compaction task goes over it's period (20s) which is quite often in production.
-        let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Compaction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return Ok(()),
+        let (_guard, _permit) = tokio::select! {
+            tuple = prepare => { tuple },
+            _ = self.cancel.cancelled() => return Ok(()),
+            _ = cancel.cancelled() => return Ok(()),
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1118,8 +1114,9 @@ impl Timeline {
        Ok(Some(true))
    }

-    /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
-    /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
+    /// Evict just one layer.
+    ///
+    /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
    pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
        let _gate = self
            .gate
@@ -1130,109 +1127,17 @@ impl Timeline {
            return Ok(None);
        };

-        let Some(local_layer) = local_layer.keep_resident().await? else {
-            return Ok(Some(false));
-        };
-
-        let local_layer: Layer = local_layer.into();
-
-        let remote_client = self
+        let rtc = self
            .remote_client
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;

-        let results = self
-            .evict_layer_batch(remote_client, &[local_layer])
-            .await?;
-        assert_eq!(results.len(), 1);
-        let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
-        match result {
-            None => anyhow::bail!("task_mgr shutdown requested"),
-            Some(Ok(())) => Ok(Some(true)),
-            Some(Err(e)) => Err(anyhow::Error::new(e)),
+        match local_layer.evict_and_wait(rtc).await {
+            Ok(()) => Ok(Some(true)),
+            Err(EvictionError::NotFound) => Ok(Some(false)),
+            Err(EvictionError::Downloaded) => Ok(Some(false)),
        }
    }
-
-    /// Evict a batch of layers.
-    pub(crate) async fn evict_layers(
-        &self,
-        layers_to_evict: &[Layer],
-    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
-
-        let remote_client = self
-            .remote_client
-            .as_ref()
-            .context("timeline must have RemoteTimelineClient")?;
-
-        self.evict_layer_batch(remote_client, layers_to_evict).await
-    }
-
-    /// Evict multiple layers at once, continuing through errors.
-    ///
-    /// The `remote_client` should be this timeline's `self.remote_client`.
-    /// We make the caller provide it so that they are responsible for handling the case
-    /// where someone wants to evict the layer but no remote storage is configured.
-    ///
-    /// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
-    /// If `Err()` is returned, no eviction was attempted.
-    /// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
-    /// Meaning of each `result[i]`:
-    /// - `Some(Err(...))` if layer replacement failed for some reason
-    ///    - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
-    /// - `Some(Ok(()))` if everything went well.
-    /// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
-    async fn evict_layer_batch(
-        &self,
-        remote_client: &Arc<RemoteTimelineClient>,
-        layers_to_evict: &[Layer],
-    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
-        {
-            // to avoid racing with detach and delete_timeline
-            let state = self.current_state();
-            anyhow::ensure!(
-                state == TimelineState::Active,
-                "timeline is not active but {state:?}"
-            );
-        }
-
-        let mut results = Vec::with_capacity(layers_to_evict.len());
-        for _ in 0..layers_to_evict.len() {
-            results.push(None);
-        }
-
-        let mut js = tokio::task::JoinSet::new();
-
-        for (i, l) in layers_to_evict.iter().enumerate() {
-            js.spawn({
-                let l = l.to_owned();
-                let remote_client = remote_client.clone();
-                async move { (i, l.evict_and_wait(&remote_client).await) }
-            });
-        }
-
-        let join = async {
-            while let Some(next) = js.join_next().await {
-                match next {
-                    Ok((i, res)) => results[i] = Some(res),
-                    Err(je) if je.is_cancelled() => unreachable!("not used"),
-                    Err(je) if je.is_panic() => { /* already logged */ }
-                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
-                }
-            }
-        };
-
-        tokio::select! {
-            _ = self.cancel.cancelled() => {},
-            _ = join => {}
-        }
-
-        assert_eq!(results.len(), layers_to_evict.len());
-        Ok(results)
-    }
 }

 /// Number of times we will compute partition within a checkpoint distance.
@@ -1829,6 +1734,7 @@ impl Timeline {
                self.current_logical_size.current_size().accuracy(),
                logical_size::Accuracy::Exact,
            );
+            self.current_logical_size.initialized.add_permits(1);
            return;
        };

@@ -1874,6 +1780,11 @@ impl Timeline {
        cancel: CancellationToken,
        background_ctx: RequestContext,
    ) {
+        scopeguard::defer! {
+            // Irrespective of the outcome of this operation, we should unblock anyone waiting for it.
+            self.current_logical_size.initialized.add_permits(1);
+        }
+
        enum BackgroundCalculationError {
            Cancelled,
            Other(anyhow::Error),
@@ -1885,22 +1796,22 @@ impl Timeline {
            let skip_concurrency_limiter = &skip_concurrency_limiter;
            async move {
                let cancel = task_mgr::shutdown_token();
-                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
+                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
                    BackgroundLoopKind::InitialLogicalSizeCalculation,
                    background_ctx,
-                    &cancel,
                );

                use crate::metrics::initial_logical_size::StartCircumstances;
                let (_maybe_permit, circumstances) = tokio::select! {
-                    res = wait_for_permit => {
-                        match res {
-                            Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
-                            Err(RateLimitError::Cancelled) => {
-                                return Err(BackgroundCalculationError::Cancelled);
-                            }
-                        }
+                    permit = wait_for_permit => {
+                        (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                    }
+                    _ = self_ref.cancel.cancelled() => {
+                        return Err(BackgroundCalculationError::Cancelled);
+                    }
+                    _ = cancel.cancelled() => {
+                        return Err(BackgroundCalculationError::Cancelled);
+                    },
                    () = skip_concurrency_limiter.cancelled() => {
                        // Some action that is part of a end user interaction requested logical size
                        // => break out of the rate limit
@@ -2165,6 +2076,55 @@ impl Timeline {

        None
    }
+
+    /// The timeline heatmap is a hint to secondary locations from the primary location,
+    /// indicating which layers are currently on-disk on the primary.
+    ///
+    /// None is returned if the Timeline is in a state where uploading a heatmap
+    /// doesn't make sense, such as shutting down or initializing.  The caller
+    /// should treat this as a cue to simply skip doing any heatmap uploading
+    /// for this timeline.
+    pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
+        let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
+
+        let remote_client = match &self.remote_client {
+            Some(c) => c,
+            None => return None,
+        };
+
+        let layer_file_names = eviction_info
+            .resident_layers
+            .iter()
+            .map(|l| l.layer.layer_desc().filename())
+            .collect::<Vec<_>>();
+
+        let decorated = match remote_client.get_layers_metadata(layer_file_names) {
+            Ok(d) => d,
+            Err(_) => {
+                // Getting metadata only fails on Timeline in bad state.
+                return None;
+            }
+        };
+
+        let heatmap_layers = std::iter::zip(
+            eviction_info.resident_layers.into_iter(),
+            decorated.into_iter(),
+        )
+        .filter_map(|(layer, remote_info)| {
+            remote_info.map(|remote_info| {
+                HeatMapLayer::new(
+                    layer.layer.layer_desc().filename(),
+                    IndexLayerMetadata::from(remote_info),
+                    layer.last_activity_ts,
+                )
+            })
+        });
+
+        Some(HeatMapTimeline::new(
+            self.timeline_id,
+            heatmap_layers.collect(),
+        ))
+    }
 }

 type TraversalId = String;
@@ -3150,6 +3110,32 @@ impl Timeline {

        Ok(image_layers)
    }
+
+    /// Wait until the background initial logical size calculation is complete, or
+    /// this Timeline is shut down.  Calling this function will cause the initial
+    /// logical size calculation to skip waiting for the background jobs barrier.
+    pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
+        if let Some(await_bg_cancel) = self
+            .current_logical_size
+            .cancel_wait_for_background_loop_concurrency_limit_semaphore
+            .get()
+        {
+            await_bg_cancel.cancel();
+        } else {
+            // We should not wait if we were not able to explicitly instruct
+            // the logical size cancellation to skip the concurrency limit semaphore.
+            // TODO: this is an unexpected case.  We should restructure so that it
+            // can't happen.
+            tracing::info!(
+                "await_initial_logical_size: can't get semaphore cancel token, skipping"
+            );
+        }
+
+        tokio::select!(
+            _ = self.current_logical_size.initialized.acquire() => {},
+            _ = self.cancel.cancelled() => {}
+        )
+    }
 }

 #[derive(Default)]
@@ -3906,7 +3892,14 @@ impl Timeline {
    /// within a layer file. We can only remove the whole file if it's fully
    /// obsolete.
    pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
-        let _g = self.gc_lock.lock().await;
+        // this is most likely the background tasks, but it might be the spawned task from
+        // immediate_gc
+        let cancel = crate::task_mgr::shutdown_token();
+        let _g = tokio::select! {
+            guard = self.gc_lock.lock() => guard,
+            _ = self.cancel.cancelled() => return Ok(GcResult::default()),
+            _ = cancel.cancelled() => return Ok(GcResult::default()),
+        };
        let timer = self.metrics.garbage_collect_histo.start_timer();

        fail_point!("before-timeline-gc");
@@ -4605,7 +4598,7 @@ mod tests {
            .await
            .unwrap();

-        let rc = timeline
+        let rtc = timeline
            .remote_client
            .clone()
            .expect("just configured this");
@@ -4618,16 +4611,12 @@ mod tests {
            .expect("should had been resident")
            .drop_eviction_guard();

-        let batch = [layer];
-
-        let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
-        let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
+        let first = async { layer.evict_and_wait(&rtc).await };
+        let second = async { layer.evict_and_wait(&rtc).await };

        let (first, second) = tokio::join!(first, second);

-        let (first, second) = (only_one(first), only_one(second));
-
-        let res = batch[0].keep_resident().await;
+        let res = layer.keep_resident().await;
        assert!(matches!(res, Ok(None)), "{res:?}");

        match (first, second) {
@@ -4648,14 +4637,6 @@ mod tests {
        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
    }

-    fn only_one<T>(mut input: Vec<Option<T>>) -> T {
-        assert_eq!(1, input.len());
-        input
-            .pop()
-            .expect("length just checked")
-            .expect("no cancellation")
-    }
-
    async fn find_some_layer(timeline: &Timeline) -> Layer {
        let layers = timeline.layers.read().await;
        let desc = layers
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,7 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        tasks::{BackgroundLoopKind, RateLimitError},
+        tasks::BackgroundLoopKind,
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -158,15 +158,15 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
            BackgroundLoopKind::Eviction,
            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
+        );
+
+        let _permit = tokio::select! {
+            permit = acquire_permit => permit,
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

        // If we evict layers but keep cached values derived from those layers, then
@@ -212,11 +212,21 @@ impl Timeline {
        // Gather layers for eviction.
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
+
        // So, we just need to deal with this.
-        let candidates: Vec<_> = {
+
+        let remote_client = match self.remote_client.as_ref() {
+            Some(c) => c,
+            None => {
+                error!("no remote storage configured, cannot evict layers");
+                return ControlFlow::Continue(());
+            }
+        };
+
+        let mut js = tokio::task::JoinSet::new();
+        {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
-            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);

@@ -262,54 +272,49 @@ impl Timeline {
                        continue;
                    }
                };
+                let layer = guard.drop_eviction_guard();
                if no_activity_for > p.threshold {
-                    candidates.push(guard.drop_eviction_guard())
+                    let remote_client = remote_client.clone();
+                    // this could cause a lot of allocations in some cases
+                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
+                    stats.candidates += 1;
                }
            }
-            candidates
-        };
-        stats.candidates = candidates.len();
-
-        let remote_client = match self.remote_client.as_ref() {
-            None => {
-                error!(
-                    num_candidates = candidates.len(),
-                    "no remote storage configured, cannot evict layers"
-                );
-                return ControlFlow::Continue(());
-            }
-            Some(c) => c,
        };

-        let results = match self.evict_layer_batch(remote_client, &candidates).await {
-            Err(pre_err) => {
-                stats.errors += candidates.len();
-                error!("could not do any evictions: {pre_err:#}");
-                return ControlFlow::Continue(());
+        let join_all = async move {
+            while let Some(next) = js.join_next().await {
+                match next {
+                    Ok(Ok(())) => stats.evicted += 1,
+                    Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                        stats.not_evictable += 1;
+                    }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => {
+                        /* already logged */
+                        stats.errors += 1;
+                    }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+                }
            }
-            Ok(results) => results,
+            stats
        };
-        assert_eq!(results.len(), candidates.len());
-        for result in results {
-            match result {
-                None => {
-                    stats.skipped_for_shutdown += 1;
-                }
-                Some(Ok(())) => {
-                    stats.evicted += 1;
-                }
-                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                    stats.not_evictable += 1;
+
+        tokio::select! {
+            stats = join_all => {
+                if stats.candidates == stats.not_evictable {
+                    debug!(stats=?stats, "eviction iteration complete");
+                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                    warn!(stats=?stats, "eviction iteration complete");
+                } else {
+                    info!(stats=?stats, "eviction iteration complete");
                }
            }
+            _ = cancel.cancelled() => {
+                // just drop the joinset to "abort"
+            }
        }
-        if stats.candidates == stats.not_evictable {
-            debug!(stats=?stats, "eviction iteration complete");
-        } else if stats.errors > 0 || stats.not_evictable > 0 {
-            warn!(stats=?stats, "eviction iteration complete");
-        } else {
-            info!(stats=?stats, "eviction iteration complete");
-        }
+
        ControlFlow::Continue(())
    }

--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -34,6 +34,9 @@ pub(super) struct LogicalSize {
    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
        OnceCell<CancellationToken>,

+    /// Once the initial logical size is initialized, this is notified.
+    pub(crate) initialized: tokio::sync::Semaphore,
+
    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
    pub initial_part_end: Option<Lsn>,

@@ -125,6 +128,7 @@ impl LogicalSize {
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
            did_return_approximate_to_walreceiver: AtomicBool::new(false),
+            initialized: tokio::sync::Semaphore::new(0),
        }
    }

@@ -135,6 +139,7 @@ impl LogicalSize {
            initial_part_end: Some(compute_to),
            size_added_after_initial: AtomicI64::new(0),
            did_return_approximate_to_walreceiver: AtomicBool::new(false),
+            initialized: tokio::sync::Semaphore::new(0),
        }
    }

--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -19,14 +19,14 @@ use super::Timeline;
 pub struct UninitializedTimeline<'t> {
    pub(crate) owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
 }

 impl<'t> UninitializedTimeline<'t> {
    pub(crate) fn new(
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
    ) -> Self {
        Self {
            owning_tenant,
@@ -169,18 +169,55 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
 ///
 /// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
 #[must_use]
-pub(crate) struct TimelineUninitMark {
+pub(crate) struct TimelineUninitMark<'t> {
+    owning_tenant: &'t Tenant,
+    timeline_id: TimelineId,
    uninit_mark_deleted: bool,
    uninit_mark_path: Utf8PathBuf,
    pub(crate) timeline_path: Utf8PathBuf,
 }

-impl TimelineUninitMark {
-    pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
-        Self {
-            uninit_mark_deleted: false,
-            uninit_mark_path,
-            timeline_path,
+/// Errors when acquiring exclusive access to a timeline ID for creation
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum TimelineExclusionError {
+    #[error("Already exists")]
+    AlreadyExists(Arc<Timeline>),
+    #[error("Already creating")]
+    AlreadyCreating,
+
+    // e.g. I/O errors, or some failure deep in postgres initdb
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl<'t> TimelineUninitMark<'t> {
+    pub(crate) fn new(
+        owning_tenant: &'t Tenant,
+        timeline_id: TimelineId,
+        uninit_mark_path: Utf8PathBuf,
+        timeline_path: Utf8PathBuf,
+    ) -> Result<Self, TimelineExclusionError> {
+        // Lock order: this is the only place we take both locks.  During drop() we only
+        // lock creating_timelines
+        let timelines = owning_tenant.timelines.lock().unwrap();
+        let mut creating_timelines: std::sync::MutexGuard<
+            '_,
+            std::collections::HashSet<TimelineId>,
+        > = owning_tenant.timelines_creating.lock().unwrap();
+
+        if let Some(existing) = timelines.get(&timeline_id) {
+            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
+        } else if creating_timelines.contains(&timeline_id) {
+            Err(TimelineExclusionError::AlreadyCreating)
+        } else {
+            creating_timelines.insert(timeline_id);
+            Ok(Self {
+                owning_tenant,
+                timeline_id,
+                uninit_mark_deleted: false,
+                uninit_mark_path,
+                timeline_path,
+            })
        }
    }

@@ -207,7 +244,7 @@ impl TimelineUninitMark {
    }
 }

-impl Drop for TimelineUninitMark {
+impl Drop for TimelineUninitMark<'_> {
    fn drop(&mut self) {
        if !self.uninit_mark_deleted {
            if self.timeline_path.exists() {
@@ -226,5 +263,11 @@ impl Drop for TimelineUninitMark {
                }
            }
        }
+
+        self.owning_tenant
+            .timelines_creating
+            .lock()
+            .unwrap()
+            .remove(&self.timeline_id);
    }
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -2191,7 +2191,7 @@ mod tests {
            .load()
            .await;
        let tline = tenant
-            .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
+            .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
            .await
            .unwrap();

--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -19,20 +19,21 @@
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"
+
+#include <curl/curl.h>
+
+#include "access/xact.h"
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "libpq/crypt.h"
+#include "miscadmin.h"
 #include "tcop/pquery.h"
 #include "tcop/utility.h"
-#include "access/xact.h"
+#include "utils/acl.h"
+#include "utils/guc.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
-#include "commands/defrem.h"
-#include "miscadmin.h"
-#include "utils/acl.h"
-#include "fmgr.h"
-#include "utils/guc.h"
-#include "port.h"
-#include <curl/curl.h>
 #include "utils/jsonb.h"
-#include "libpq/crypt.h"

 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;

--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -1,4 +1,3 @@
-
 /*-------------------------------------------------------------------------
 *
 * extension_server.c
@@ -10,21 +9,11 @@
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"
-#include "tcop/pquery.h"
-#include "tcop/utility.h"
-#include "access/xact.h"
-#include "utils/hsearch.h"
-#include "utils/memutils.h"
-#include "commands/defrem.h"
-#include "miscadmin.h"
-#include "utils/acl.h"
-#include "fmgr.h"
-#include "utils/guc.h"
-#include "port.h"
-#include "fmgr.h"

 #include <curl/curl.h>

+#include "utils/guc.h"
+
 static int	extension_server_port = 0;

 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -13,32 +13,30 @@
 *-------------------------------------------------------------------------
 */

+#include "postgres.h"
+
 #include <sys/file.h>
 #include <unistd.h>
 #include <fcntl.h>

-#include "postgres.h"
-
 #include "neon_pgversioncompat.h"

+#include "access/parallel.h"
 #include "funcapi.h"
 #include "miscadmin.h"
-#include "pgstat.h"
 #include "pagestore_client.h"
-#include "access/parallel.h"
+#include "pgstat.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
 #include "storage/buf_internals.h"
-#include "storage/latch.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/latch.h"
 #include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
 #include "utils/builtins.h"
 #include "utils/dynahash.h"
 #include "utils/guc.h"
-#include "storage/fd.h"
-#include "storage/pg_shmem.h"
-#include "storage/buf_internals.h"
-#include "pgstat.h"

 /*
 * Local file cache is used to temporary store relations pages in local file system.
@@ -102,8 +100,6 @@ static shmem_request_hook_type prev_shmem_request_hook;

 #define LFC_ENABLED() (lfc_ctl->limit != 0)

-void		PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
-
 /*
 * Local file cache is optional and Neon can work without it.
 * In case of any any errors with this cache, we should disable it but to not throw error.
@@ -312,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
 		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
 		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
-			elog(LOG, "Failed to punch hole in file: %m");
+			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
-	elog(DEBUG1, "set local file cache limit to %d", new_size);
+	neon_log(DEBUG1, "set local file cache limit to %d", new_size);

 	LWLockRelease(lfc_lock);
 }
@@ -331,7 +327,7 @@ lfc_init(void)
 	 * shared_preload_libraries.
 	 */
 	if (!process_shared_preload_libraries_in_progress)
-		elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
+		neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");


 	DefineCustomIntVariable("neon.max_file_cache_size",
@@ -647,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-			elog(DEBUG2, "Swap file cache page");
+			neon_log(DEBUG2, "Swap file cache page");
 		}
 		else
 		{
@@ -850,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		 * wrong) function definition though.
 		 */
 		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
-			elog(ERROR, "return type must be a row type");
+			neon_log(ERROR, "return type must be a row type");

 		if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
-			elog(ERROR, "incorrect number of output arguments");
+			neon_log(ERROR, "incorrect number of output arguments");

 		/* Construct a tuple descriptor for the result rows. */
 		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -14,44 +14,30 @@
 */
 #include "postgres.h"

-#include "pagestore_client.h"
-#include "fmgr.h"
 #include "access/xlog.h"
-#include "access/xlogutils.h"
-#include "storage/buf_internals.h"
-#include "storage/lwlock.h"
-#include "storage/ipc.h"
-#include "storage/pg_shmem.h"
-#include "c.h"
-#include "postmaster/interrupt.h"
-
+#include "common/hashfn.h"
+#include "fmgr.h"
 #include "libpq-fe.h"
-#include "libpq/pqformat.h"
 #include "libpq/libpq.h"
-
+#include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "postmaster/interrupt.h"
+#include "storage/buf_internals.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
 #include "utils/guc.h"

 #include "neon.h"
-#include "walproposer.h"
 #include "neon_utils.h"
+#include "pagestore_client.h"
+#include "walproposer.h"

 #define PageStoreTrace DEBUG5

 #define RECONNECT_INTERVAL_USEC 1000000

-bool		connected = false;
-PGconn	   *pageserver_conn = NULL;
-
-/*
- * WaitEventSet containing:
- * - WL_SOCKET_READABLE on pageserver_conn,
- * - WL_LATCH_SET on MyLatch, and
- * - WL_EXIT_ON_PM_DEATH.
- */
-WaitEventSet *pageserver_conn_wes = NULL;
-
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
@@ -62,89 +48,206 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-int			n_reconnect_attempts = 0;
-int			max_reconnect_attempts = 60;
+static int	n_reconnect_attempts = 0;
+static int	max_reconnect_attempts = 60;
+static int	stripe_size;

-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+
+static bool pageserver_flush(shardno_t shard_no);
+static void pageserver_disconnect(shardno_t shard_no);
+static void AssignPageserverConnstring(const char *newval, void *extra);
+static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source);
+
+static shmem_startup_hook_type prev_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
+#endif
+
+/*
+ * ShardMap is kept in shared memory. It contains the connection strings for
+ * each shard.
+ *
+ * There is "neon.pageserver_connstring" GUC with PGC_SIGHUP option, allowing to change it using
+ * pg_reload_conf(). It is used by control plane to update shards information if page server is crashed,
+ * relocated or new shards are added. This GUC variable contains comma separated list of connection strings.
+ * It is copied to shared memory because config can not be loaded during query execution and we need to
+ * reestablish connection to page server.
+ *
+ * So usually copying connection string to shared memory is done by postmaster. And other backends
+ * should check update counter to determine of connection URL is changed and connection needs to be reestablished.
+ *
+ * But at startup shared memory is not yet initialized and so we need to copy in some other process.
+ * Moreover, we can not use standard Postgres LW-locks, because postmaster has proc entry and so can not wait
+ * on this primitive. This is why lockless access algorithm is implemented using two atomic counters to enforce
+ * consistent reading of connection string value from shared memory.
+ */
+typedef struct
+{
+	size_t n_shards;
+	pg_atomic_uint64 begin_update_counter;
+	pg_atomic_uint64 end_update_counter;
+	char   shard_connstr[MAX_SHARDS][MAX_PS_CONNSTR_LEN];
+} ShardMap;
+
+
+static ShardMap* shard_map;
+static uint64    shard_map_update_counter;

 typedef struct
 {
-	LWLockId	lock;
-	pg_atomic_uint64 update_counter;
-	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
-} PagestoreShmemState;
+	/*
+	 * Connection for each shard
+	 */
+	PGconn	   *conn;
+    /*
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet    *wes;
+} PageServer;

-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-static void walproposer_shmem_request(void);
+static PageServer page_servers[MAX_SHARDS];
+static shardno_t  max_attached_shard_no;
+
+static void
+psm_shmem_startup(void)
+{
+	bool found;
+	if (prev_shmem_startup_hook)
+	{
+		prev_shmem_startup_hook();
+	}
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	shard_map = (ShardMap*)ShmemInitStruct("shard_map", sizeof(ShardMap), &found);
+	if (!found)
+	{
+		shard_map->n_shards = 0;
+		pg_atomic_init_u64(&shard_map->begin_update_counter, 0);
+		pg_atomic_init_u64(&shard_map->end_update_counter, 0);
+		AssignPageserverConnstring(page_server_connstring, NULL);
+	}
+	LWLockRelease(AddinShmemInitLock);
+}
+
+static void
+psm_shmem_request(void)
+{
+#if PG_VERSION_NUM>=150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
 #endif
-static shmem_startup_hook_type prev_shmem_startup_hook;
-static PagestoreShmemState *pagestore_shared;
-static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];

-bool		(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
-
-static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
-
-static bool
-PagestoreShmemIsValid()
-{
-	return pagestore_shared && UsedShmemSegAddr;
-}
-
-static bool
-CheckPageserverConnstring(char **newval, void **extra, GucSource source)
-{
-	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	RequestAddinShmemSpace(sizeof(ShardMap));
 }

 static void
-AssignPageserverConnstring(const char *newval, void *extra)
+psm_init(void)
 {
-	if (!PagestoreShmemIsValid())
-		return;
-	LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-	LWLockRelease(pagestore_shared->lock);
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = psm_shmem_startup;
+#if PG_VERSION_NUM>=150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = psm_shmem_request;
+#else
+	psm_shmem_request();
+#endif
+}
+
+/*
+ * Reload page map if needed and return number of shards and connection string for the specified shard
+ * 'connstr' is an output buffer. If not NULL, it must point to a buffer at least MAX_PS_CONNSTR_LEN bytes
+ * long. The connection string for the gven shard is copied to it.
+ */
+static shardno_t
+load_shard_map(shardno_t shard_no, char* connstr)
+{
+	shardno_t n_shards;
+	uint64 begin_update_counter;
+	uint64 end_update_counter;
+
+	/*
+	 * There is race condition here between backend and postmaster which can update shard map.
+	 * We recheck update counter after copying shard map to check that configuration was not changed.
+	 */
+	do
+	{
+		begin_update_counter = pg_atomic_read_u64(&shard_map->begin_update_counter);
+		end_update_counter = pg_atomic_read_u64(&shard_map->end_update_counter);
+
+		n_shards = shard_map->n_shards;
+		if (shard_no >= n_shards)
+			neon_log(ERROR, "Shard %d is greater or equal than number of shards %d", shard_no, n_shards);
+
+		if (connstr)
+		{
+			/*
+			 * We need to use strlcpy here because due to race condition string oin shared memory
+			 * may be not zero terminated.
+			 */
+			strlcpy(connstr, shard_map->shard_connstr[shard_no], MAX_PS_CONNSTR_LEN);
+			pg_memory_barrier();
+		}
+
+	}
+	while (begin_update_counter != end_update_counter
+		   || begin_update_counter != pg_atomic_read_u64(&shard_map->begin_update_counter)
+		   || end_update_counter != pg_atomic_read_u64(&shard_map->end_update_counter));
+
+
+	if (shard_map_update_counter != end_update_counter)
+	{
+		/* Reset all connections if connection strings are changed */
+		for (shardno_t i = 0; i < max_attached_shard_no; i++)
+		{
+			if (page_servers[i].conn)
+				pageserver_disconnect(i);
+		}
+		max_attached_shard_no = 0;
+		shard_map_update_counter = end_update_counter;
+	}
+
+	return n_shards;
+}
+
+#define MB (1024*1024)
+
+shardno_t
+get_shard_number(BufferTag* tag)
+{
+	shardno_t n_shards = load_shard_map(0, NULL);
+	uint32	  hash;
+
+#if PG_MAJORVERSION_NUM < 16
+	hash = murmurhash32(tag->rnode.relNode);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum/(MB/BLCKSZ)/stripe_size));
+#else
+	hash = murmurhash32(tag->relNumber);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum/(MB/BLCKSZ)/stripe_size));
+#endif
+
+	return hash % n_shards;
 }

 static bool
-CheckConnstringUpdated()
-{
-	if (!PagestoreShmemIsValid())
-		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
-}
-
-static void
-ReloadConnstring()
-{
-	if (!PagestoreShmemIsValid())
-		return;
-	LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-	strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-	pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-	LWLockRelease(pagestore_shared->lock);
-}
-
-static bool
-pageserver_connect(int elevel)
+pageserver_connect(shardno_t shard_no, int elevel)
 {
 	char	   *query;
 	int			ret;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
+	PGconn*		conn;
+	WaitEventSet *wes;
+	char        connstr[MAX_PS_CONNSTR_LEN];

-	Assert(!connected);
+	Assert(page_servers[shard_no].conn == NULL);

-	if (CheckConnstringUpdated())
-	{
-		ReloadConnstring();
-	}
+	(void)load_shard_map(shard_no, connstr); /* refresh page map if needed */

 	/*
 	 * Connect using the connection string we got from the
@@ -164,50 +267,47 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = connstr;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
 	n++;
-	pageserver_conn = PQconnectdbParams(keywords, values, 1);
+	conn = PQconnectdbParams(keywords, values, 1);

-	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (PQstatus(conn) == CONNECTION_BAD)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		char	   *msg = pchomp(PQerrorMessage(conn));

-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);

 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "could not establish connection to pageserver"),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
 				 errdetail_internal("%s", msg)));
 		return false;
 	}
-
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-	ret = PQsendQuery(pageserver_conn, query);
+	ret = PQsendQuery(conn, query);
 	if (ret != 1)
 	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		neon_log(elevel, "could not send pagestream command to pageserver");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
 		return false;
 	}

-	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

-	while (PQisBusy(pageserver_conn))
+	while (PQisBusy(conn))
 	{
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -215,25 +315,25 @@ pageserver_connect(int elevel)
 		/* Data available in socket? */
 		if (event.events & WL_SOCKET_READABLE)
 		{
-			if (!PQconsumeInput(pageserver_conn))
+			if (!PQconsumeInput(conn))
 			{
-				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				char	   *msg = pchomp(PQerrorMessage(conn));

-				PQfinish(pageserver_conn);
-				pageserver_conn = NULL;
-				FreeWaitEventSet(pageserver_conn_wes);
-				pageserver_conn_wes = NULL;
+				PQfinish(conn);
+				FreeWaitEventSet(wes);

-				neon_log(elevel, "could not complete handshake with pageserver: %s",
-						 msg);
+				neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
+							   msg);
 				return false;
 			}
 		}
 	}

-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;
+	max_attached_shard_no = Max(shard_no+1, max_attached_shard_no);

-	connected = true;
 	return true;
 }

@@ -241,10 +341,10 @@ pageserver_connect(int elevel)
 * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
 */
 static int
-call_PQgetCopyData(char **buffer)
+call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
-
+	PGconn*     pageserver_conn = page_servers[shard_no].conn;
 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );

@@ -253,7 +353,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -265,7 +365,7 @@ retry:
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
 			}
@@ -279,7 +379,7 @@ retry:


 static void
-pageserver_disconnect(void)
+pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -288,38 +388,32 @@ pageserver_disconnect(void)
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
 	 */
-	if (connected)
+	if (page_servers[shard_no].conn)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;

 		prefetch_on_ps_disconnect();
 	}
-	if (pageserver_conn_wes != NULL)
+	if (page_servers[shard_no].wes != NULL)
 	{
-		FreeWaitEventSet(pageserver_conn_wes);
-		pageserver_conn_wes = NULL;
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
 	}
 }

 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-
-	if (CheckConnstringUpdated())
-	{
-		pageserver_disconnect();
-		ReloadConnstring();
-	}
+	PGconn* pageserver_conn = page_servers[shard_no].conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
+		pageserver_disconnect(shard_no);
 	}

 	req_buff = nm_pack_request(request);
@@ -333,9 +427,9 @@ pageserver_send(NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
@@ -344,7 +438,9 @@ pageserver_send(NeonRequest *request)
 		n_reconnect_attempts = 0;
 	}

-	/*
+	pageserver_conn = page_servers[shard_no].conn;
+
+    /*
 	 * Send request.
 	 *
 	 * In principle, this could block if the output buffer is full, and we
@@ -355,9 +451,8 @@ pageserver_send(NeonRequest *request)
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -369,19 +464,19 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);

-		neon_log(PageStoreTrace, "sent request: %s", msg);
+		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
 	return true;
 }

 static NeonResponse *
-pageserver_receive(void)
+pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-
-	if (!connected)
+	PGconn* pageserver_conn = page_servers[shard_no].conn;
+	if (!pageserver_conn)
 		return NULL;

 	PG_TRY();
@@ -389,7 +484,7 @@ pageserver_receive(void)
 		/* read response */
 		int			rc;

-		rc = call_PQgetCopyData(&resp_buff.data);
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
 		if (rc >= 0)
 		{
 			resp_buff.len = rc;
@@ -401,33 +496,33 @@ pageserver_receive(void)
 			{
 				char	   *msg = nm_to_string((NeonMessage *) resp);

-				neon_log(PageStoreTrace, "got response: %s", msg);
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
 				pfree(msg);
 			}
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
-			pageserver_disconnect();
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -437,11 +532,12 @@ pageserver_receive(void)


 static bool
-pageserver_flush(void)
+pageserver_flush(shardno_t shard_no)
 {
-	if (!connected)
+	PGconn* pageserver_conn = page_servers[shard_no].conn;
+	if (!pageserver_conn)
 	{
-		neon_log(WARNING, "Tried to flush while disconnected");
+		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
 	else
 	{
@@ -449,8 +545,8 @@ pageserver_flush(void)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
 		}
@@ -473,63 +569,83 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

-static Size
-PagestoreShmemSize(void)
-{
-	return sizeof(PagestoreShmemState);
-}
-
 static bool
-PagestoreShmemInit(void)
+CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-	bool		found;
-
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
-	pagestore_shared = ShmemInitStruct("libpagestore shared state",
-									   PagestoreShmemSize(),
-									   &found);
-	if (!found)
+	const char* shard_connstr = *newval;
+	const char* sep;
+	size_t connstr_len;
+	int i = 0;
+	do
 	{
-		pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-		pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
-		AssignPageserverConnstring(page_server_connstring, NULL);
+		sep = strchr(shard_connstr, ',');
+		connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
+		if (connstr_len == 0)
+			break; /* trailing comma */
+		if (i >= MAX_SHARDS)
+		{
+			neon_log(LOG, "Too many shards");
+			return false;
+		}
+		if (connstr_len >= MAX_PS_CONNSTR_LEN)
+		{
+			neon_log(LOG, "Connection string too long");
+			return false;
+		}
+		shard_connstr = sep + 1;
+		i += 1;
+	} while (sep != NULL);
+
+	return true;
+}
+
+static void
+AssignPageserverConnstring(const char *newval, void *extra)
+{
+	/*
+	 * Load shard map only at Postmaster.
+	 * If old page server is not available, then backends can be blocked in attempts to reconnect to it and do not reload config in this loop
+	 *
+	 * Copying GUC value to shared memory is usually performed by postmaster. But in case of startup,
+	 * shared memory is not yet initialized. So it has to be performed by any other process.
+	 * It is not a problem if more than one process do this initialization.
+	 */
+	if (shard_map != NULL && UsedShmemSegAddr != NULL && (MyProcPid == PostmasterPid || shard_map->n_shards == 0))
+	{
+		const char* shard_connstr = newval;
+		const char* sep;
+		size_t connstr_len;
+		int i = 0;
+		bool shard_map_changed = false;
+		do
+		{
+			sep = strchr(shard_connstr, ',');
+			connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
+			if (connstr_len == 0)
+				break; /* trailing comma */
+			Assert(i < MAX_SHARDS);
+			Assert(connstr_len < MAX_PS_CONNSTR_LEN);
+			if (i >= shard_map->n_shards ||
+				strcmp(shard_map->shard_connstr[i], shard_connstr) != 0)
+			{
+				if (!shard_map_changed)
+				{
+					pg_atomic_add_fetch_u64(&shard_map->begin_update_counter, 1);
+					shard_map_changed = true;
+				}
+				memcpy(shard_map->shard_connstr[i], shard_connstr, connstr_len+1);
+			}
+			shard_connstr = sep + 1;
+			i += 1;
+		} while (sep != NULL);
+
+		if (shard_map_changed)
+		{
+			shard_map->n_shards = i;
+			pg_memory_barrier();
+			pg_atomic_add_fetch_u64(&shard_map->end_update_counter, 1);
+		}
 	}
-	LWLockRelease(AddinShmemInitLock);
-	return found;
-}
-
-static void
-pagestore_shmem_startup_hook(void)
-{
-	if (prev_shmem_startup_hook)
-		prev_shmem_startup_hook();
-
-	PagestoreShmemInit();
-}
-
-static void
-pagestore_shmem_request(void)
-{
-#if PG_VERSION_NUM >= 150000
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
-#endif
-
-	RequestAddinShmemSpace(PagestoreShmemSize());
-	RequestNamedLWLockTranche("neon_libpagestore", 1);
-}
-
-static void
-pagestore_prepare_shmem(void)
-{
-#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = pagestore_shmem_request;
-#else
-	pagestore_shmem_request();
-#endif
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = pagestore_shmem_startup_hook;
 }

 /*
@@ -538,8 +654,6 @@ pagestore_prepare_shmem(void)
 void
 pg_init_libpagestore(void)
 {
-	pagestore_prepare_shmem();
-
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -567,6 +681,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomIntVariable("neon.stripe_size",
+							"sharding stripe size",
+							NULL,
+							&stripe_size,
+							32768, 1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_BLOCKS,
+							NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
@@ -627,9 +750,8 @@ pg_init_libpagestore(void)
 		smgr_hook = smgr_neon;
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
-		old_redo_read_buffer_filter = redo_read_buffer_filter;
-		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}

 	lfc_init();
+	psm_init();
 }
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -27,13 +27,6 @@ extern void pg_init_walproposer(void);

 extern void pg_init_extension_server(void);

-/*
- * Returns true if we shouldn't do REDO on that block in record indicated by
- * block_id; false otherwise.
- */
-extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
-
 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -1,32 +1,10 @@
+
+#include <sys/resource.h>
+
 #include "postgres.h"

-#include "access/timeline.h"
-#include "access/xlogutils.h"
-#include "common/logging.h"
-#include "common/ip.h"
-#include "funcapi.h"
-#include "libpq/libpq.h"
+#include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
-#include "miscadmin.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
-#include "replication/walsender_private.h"
-
-#include "storage/ipc.h"
-#include "utils/builtins.h"
-#include "utils/ps_status.h"
-
-#include "libpq-fe.h"
-#include <netinet/tcp.h>
-#include <unistd.h>
-
-#if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
-#include "access/xlogrecovery.h"
-#endif
-#if PG_MAJORVERSION_NUM >= 16
-#include "utils/guc.h"
-#endif

 /*
 * Convert a character which represents a hexadecimal digit to an integer.
@@ -114,3 +92,25 @@ pq_sendint64_le(StringInfo buf, uint64 i)
 	memcpy(buf->data + buf->len, &i, sizeof(uint64));
 	buf->len += sizeof(uint64);
 }
+
+/*
+ * Disables core dump for the current process.
+ */
+void
+disable_core_dump()
+{
+	struct rlimit rlim;
+
+#ifdef WALPROPOSER_LIB			/* skip in simulation mode */
+	return;
+#endif
+
+	rlim.rlim_cur = 0;
+	rlim.rlim_max = 0;
+	if (setrlimit(RLIMIT_CORE, &rlim))
+	{
+		int			save_errno = errno;
+
+		fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno));
+	}
+}
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -1,12 +1,11 @@
 #ifndef __NEON_UTILS_H__
 #define __NEON_UTILS_H__

-#include "postgres.h"
-
 bool		HexDecodeString(uint8 *result, char *input, int nbytes);
 uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
 void		pq_sendint32_le(StringInfo buf, uint32 i);
 void		pq_sendint64_le(StringInfo buf, uint64 i);
+extern void disable_core_dump();

 #endif							/* __NEON_UTILS_H__ */
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -13,19 +13,24 @@
 #ifndef pageserver_h
 #define pageserver_h

-#include "postgres.h"
 #include "neon_pgversioncompat.h"

 #include "access/xlogdefs.h"
 #include RELFILEINFO_HDR
 #include "storage/block.h"
 #include "storage/smgr.h"
+#include "storage/buf_internals.h"
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
+#include "storage/block.h"
+#include "storage/smgr.h"
 #include "utils/memutils.h"

 #include "pg_config.h"

+#define MAX_SHARDS 128
+#define MAX_PS_CONNSTR_LEN 128
+
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -54,6 +59,9 @@ typedef struct
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
 										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))

 /*
 * supertype of all the Neon*Request structs below
@@ -144,11 +152,13 @@ extern char *nm_to_string(NeonMessage *msg);
 * API
 */

+typedef unsigned shardno_t;
+
 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
-	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
+	NeonResponse *(*receive) (shardno_t shard_no);
+	bool		(*flush) (shardno_t shard_no);
 } page_server_api;

 extern void prefetch_on_ps_disconnect(void);
@@ -158,13 +168,12 @@ extern page_server_api *page_server;
 extern char *page_server_connstring;
 extern int	flush_every_n_requests;
 extern int	readahead_buffer_size;
-extern bool seqscan_prefetch_enabled;
-extern int	seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
-extern bool wal_redo;
 extern int32 max_cluster_size;

+extern shardno_t get_shard_number(BufferTag* tag);
+
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -47,25 +47,26 @@

 #include "access/xact.h"
 #include "access/xlog.h"
+#include "access/xlogdefs.h"
 #include "access/xloginsert.h"
 #include "access/xlog_internal.h"
-#include "access/xlogdefs.h"
+#include "access/xlogutils.h"
 #include "catalog/pg_class.h"
 #include "common/hashfn.h"
 #include "executor/instrument.h"
-#include "pagestore_client.h"
-#include "postmaster/interrupt.h"
+#include "pgstat.h"
 #include "postmaster/autovacuum.h"
+#include "postmaster/interrupt.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
 #include "storage/fsm_internals.h"
-#include "storage/smgr.h"
 #include "storage/md.h"
-#include "pgstat.h"
+#include "storage/smgr.h"
+
+#include "pagestore_client.h"

 #if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
 #include "access/xlogrecovery.h"
 #endif

@@ -106,6 +107,9 @@ typedef enum
 static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

+static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
+static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+
 /*
 * Prefetch implementation:
 *
@@ -168,6 +172,7 @@ typedef struct PrefetchRequest
 	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
+	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;

@@ -235,11 +240,13 @@ typedef struct PrefetchState
 								 * also unused */

 	/* the buffers */
-	prfh_hash  *prf_hash;
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;

-PrefetchState *MyPState;
+static PrefetchState *MyPState;

 #define GetPrfSlot(ring_index) ( \
 	( \
@@ -257,7 +264,7 @@ PrefetchState *MyPState;
 	) \
 )

-XLogRecPtr	prefetch_lsn = 0;
+static XLogRecPtr prefetch_lsn = 0;

 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
@@ -323,6 +330,7 @@ compact_prefetch_buffers(void)
 		Assert(target_slot->status == PRFS_UNUSED);

 		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -490,6 +498,23 @@ prefetch_cleanup_trailing_unused(void)
 	}
 }

+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (MyPState->shard_bitmap[shard_no >> 3] & (1 << (shard_no & 7)))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			MyPState->shard_bitmap[shard_no >> 3] &= ~(1 << (shard_no & 7));
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
@@ -505,7 +530,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
@@ -543,7 +568,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->my_ring_index == MyPState->ring_receive);

 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive();
+	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
@@ -700,12 +725,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send((NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
 	MyPState->n_unused -= 1;
 	MyPState->ring_unused += 1;
+	MyPState->shard_bitmap[slot->shard_no >> 3] |= 1 << (slot->shard_no & 7);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
@@ -876,6 +903,7 @@ Retry:
 	 * function reads the buffer tag from the slot.
 	 */
 	slot->buftag = tag;
+	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;

 	prefetch_do_request(slot, force_latest, force_lsn);
@@ -886,7 +914,7 @@ Retry:
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 		{
 			/*
 			 * Prefetch set is reset in case of error, so we should try to
@@ -904,13 +932,44 @@ static NeonResponse *
 page_server_request(void const *req)
 {
 	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (((NeonRequest *) req)->tag)
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
+	}
+	shard_no = get_shard_number(&tag);
+
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_no() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	{
+		shard_no = 0;
+	}

 	do
 	{
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
-		MyPState->ring_flush = MyPState->ring_unused;
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
 		consume_prefetch_responses();
-		resp = page_server->receive();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 	return resp;

@@ -986,7 +1045,7 @@ nm_pack_request(NeonRequest *msg)
 		case T_NeonErrorResponse:
 		case T_NeonDbSizeResponse:
 		default:
-			elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
+			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
 			break;
 	}
 	return s;
@@ -1081,7 +1140,7 @@ nm_unpack_response(StringInfo s)
 		case T_NeonGetPageRequest:
 		case T_NeonDbSizeRequest:
 		default:
-			elog(ERROR, "unexpected neon message tag 0x%02x", tag);
+			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
 			break;
 	}

@@ -1273,7 +1332,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 		XLogFlush(recptr);
 		lsn = recptr;
 		ereport(SmgrTrace,
-				(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
+				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1301,7 +1360,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 		if (PageIsNew((Page) buffer))
 		{
 			ereport(SmgrTrace,
-					(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
+					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
@@ -1309,7 +1368,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 		else if (PageIsEmptyHeapPage((Page) buffer))
 		{
 			ereport(SmgrTrace,
-					(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
+					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
@@ -1317,7 +1376,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 		else
 		{
 			ereport(PANIC,
-					(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
+					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
@@ -1326,7 +1385,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1371,6 +1430,9 @@ neon_init(void)
 	MyPState->prf_hash = prfh_create(MyPState->hashctx,
 									 readahead_buffer_size, NULL);

+	old_redo_read_buffer_filter = redo_read_buffer_filter;
+	redo_read_buffer_filter = neon_redo_read_buffer_filter;
+
 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
 #endif
@@ -1423,7 +1485,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
 		lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 		lsn = nm_adjust_lsn(lsn);

-		elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
+		neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));
 	}
 	else
@@ -1438,7 +1500,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
 		*latest = true;
 		lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 		Assert(lsn != InvalidXLogRecPtr);
-		elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
+		neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));

 		lsn = nm_adjust_lsn(lsn);
@@ -1458,7 +1520,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
 #endif
 		if (lsn > flushlsn)
 		{
-			elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
+			neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
 				 (uint32) (lsn >> 32), (uint32) lsn,
 				 (uint32) (flushlsn >> 32), (uint32) flushlsn);
 			XLogFlush(lsn);
@@ -1502,7 +1564,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			return mdexists(reln, forkNum);

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
@@ -1554,7 +1616,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+					 errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -1563,7 +1625,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;

 		default:
-			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -1580,7 +1642,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");

 		case RELPERSISTENCE_PERMANENT:
 			break;
@@ -1591,10 +1653,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	elog(SmgrTrace, "Create relation %u/%u/%u.%u",
+	neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
 		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forkNum);

@@ -1689,7 +1751,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");

 		case RELPERSISTENCE_PERMANENT:
 			break;
@@ -1700,7 +1762,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	/*
@@ -1719,7 +1781,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
 					(errcode(ERRCODE_DISK_FULL),
-					 errmsg("could not extend file because project size limit (%d MB) has been exceeded",
+					 errmsg(NEON_TAG "could not extend file because project size limit (%d MB) has been exceeded",
 							max_cluster_size),
 					 errhint("This limit is defined externally by the project size limit, and internally by neon.max_cluster_size GUC")));
 	}
@@ -1738,7 +1800,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);

 	lsn = PageGetLSN((Page) buffer);
-	elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+	neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
 		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);
@@ -1778,7 +1840,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");

 		case RELPERSISTENCE_PERMANENT:
 			break;
@@ -1789,7 +1851,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	if (max_cluster_size > 0 &&
@@ -1801,7 +1863,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
 					(errcode(ERRCODE_DISK_FULL),
-					 errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
+					 errmsg(NEON_TAG "could not extend file because cluster size limit (%d MB) has been exceeded",
 							max_cluster_size),
 					 errhint("This limit is defined by neon.max_cluster_size GUC")));
 	}
@@ -1814,7 +1876,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+				 errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
 						relpath(reln->smgr_rlocator, forkNum),
 						InvalidBlockNumber)));

@@ -1875,7 +1937,7 @@ neon_open(SMgrRelation reln)
 	mdopen(reln);

 	/* no work */
-	elog(SmgrTrace, "[NEON_SMGR] open noop");
+	neon_log(SmgrTrace, "open noop");
 }

 /*
@@ -1912,7 +1974,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			return mdprefetch(reln, forknum, blocknum);

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
@@ -1957,11 +2019,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	/* not implemented */
-	elog(SmgrTrace, "[NEON_SMGR] writeback noop");
+	neon_log(SmgrTrace, "writeback noop");

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2091,8 +2153,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							blkno,
+					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2100,7 +2162,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2124,7 +2186,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");

 		case RELPERSISTENCE_PERMANENT:
 			break;
@@ -2135,7 +2197,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	/* Try to read from local file cache */
@@ -2163,7 +2225,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		{
 			if (!PageIsNew((Page) pageserver_masked))
 			{
-				elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+				neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
 					 blkno,
 					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 					 forkNum,
@@ -2173,7 +2235,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		}
 		else if (PageIsNew((Page) buffer))
 		{
-			elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
+			neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
 				 blkno,
 				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 				 forkNum,
@@ -2188,7 +2250,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer

 			if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
 			{
-				elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+				neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
 					 blkno,
 					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 					 forkNum,
@@ -2207,7 +2269,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer

 				if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
 				{
-					elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
+					neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
 						 blkno,
 						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						 forkNum,
@@ -2287,13 +2349,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	neon_wallog_page(reln, forknum, blocknum, buffer, false);

 	lsn = PageGetLSN((Page) buffer);
-	elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+	neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
 		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);
@@ -2320,7 +2382,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
 			break;

 		case RELPERSISTENCE_PERMANENT:
@@ -2331,12 +2393,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			return mdnblocks(reln, forknum);

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
 	{
-		elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
+		neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
 			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 			 forknum, n_blocks);
 		return n_blocks;
@@ -2364,7 +2426,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+					 errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2373,11 +2435,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		default:
-			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

-	elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
+	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
 		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forknum,
 		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
@@ -2420,7 +2482,7 @@ neon_dbsize(Oid dbNode)
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg("could not read db size of db %u from page server at lsn %X/%08X",
+					 errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
 							dbNode,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -2428,10 +2490,10 @@ neon_dbsize(Oid dbNode)
 			break;

 		default:
-			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}

-	elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
+	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
 		 dbNode,
 		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 		 db_size);
@@ -2451,7 +2513,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
 			break;

 		case RELPERSISTENCE_PERMANENT:
@@ -2463,7 +2525,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
@@ -2519,7 +2581,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
 			break;

 		case RELPERSISTENCE_PERMANENT:
@@ -2531,10 +2593,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
+	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2559,17 +2621,17 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 * progress at a time. That's enough for the current usage.
 	 */
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
-		elog(ERROR, "unlogged relation build is already in progress");
+		neon_log(ERROR, "unlogged relation build is already in progress");
 	Assert(unlogged_build_rel == NULL);

 	ereport(SmgrTrace,
-			(errmsg("starting unlogged build of relation %u/%u/%u",
+			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
 					RelFileInfoFmt(InfoFromSMgrRel(reln)))));

 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
+			neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
 			break;

 		case RELPERSISTENCE_PERMANENT:
@@ -2582,11 +2644,11 @@ neon_start_unlogged_build(SMgrRelation reln)
 			return;

 		default:
-			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

 	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
-		elog(ERROR, "cannot perform unlogged index build, index is not empty ");
+		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");

 	unlogged_build_rel = reln;
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
@@ -2613,7 +2675,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_rel == reln);

 	ereport(SmgrTrace,
-			(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
+			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
 					RelFileInfoFmt(InfoFromSMgrRel(reln)))));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2642,7 +2704,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 	Assert(unlogged_build_rel == reln);

 	ereport(SmgrTrace,
-			(errmsg("ending unlogged build of relation %u/%u/%u",
+			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
 					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2657,7 +2719,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		rinfob = InfoBFromSMgrRel(reln);
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
-			elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
+			neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
 				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
 				 forknum);

@@ -2700,7 +2762,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
-						 (errmsg("unlogged index build was not properly finished"))));
+						 (errmsg(NEON_TAG "unlogged index build was not properly finished"))));
 			}
 			break;
 	}
@@ -2799,14 +2861,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		set_cached_relsize(rinfo, forknum, relsize);
 		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);

-		elog(SmgrTrace, "Set length to %d", relsize);
+		neon_log(SmgrTrace, "Set length to %d", relsize);
 	}
 }

 #define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)

 /*
- * TODO: May be it is better to make correspondent fgunctio from freespace.c public?
+ * TODO: May be it is better to make correspondent function from freespace.c public?
 */
 static BlockNumber
 get_fsm_physical_block(BlockNumber heapblk)
@@ -2869,7 +2931,7 @@ get_fsm_physical_block(BlockNumber heapblk)
 * contents, where with REDO locking it would wait on block 1 and see
 * block 3 with post-REDO contents only.
 */
-bool
+static bool
 neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 {
 	XLogRecPtr	end_recptr = record->EndRecPtr;
@@ -2887,7 +2949,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 #if PG_VERSION_NUM < 150000
 	if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
-		elog(PANIC, "failed to locate backup block with ID %d", block_id);
+		neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
 #else
 	XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
 #endif
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -35,6 +35,8 @@
 *
 *-------------------------------------------------------------------------
 */
+#include <sys/resource.h>
+
 #include "postgres.h"
 #include "libpq/pqformat.h"
 #include "neon.h"
@@ -1069,6 +1071,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											walprop_shared->mineLastElectedTerm)))
 			{
+				/*
+				 * Panic to restart PG as we need to retake basebackup.
+				 * However, don't dump core as this is kinda expected
+				 * scenario.
+				 */
+				disable_core_dump();
 				walprop_log(PANIC,
 							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
 							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
@@ -1445,7 +1453,12 @@ RecvAppendResponses(Safekeeper *sk)

 		if (sk->appendResponse.term > wp->propTerm)
 		{
-			/* Another compute with higher term is running. */
+			/*
+			 * Another compute with higher term is running. Panic to restart
+			 * PG as we likely need to retake basebackup. However, don't dump
+			 * core as this is kinda expected scenario.
+			 */
+			disable_core_dump();
 			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
 						sk->host, sk->port,
 						sk->appendResponse.term, wp->propTerm);
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -1,14 +1,12 @@
 #ifndef __NEON_WALPROPOSER_H__
 #define __NEON_WALPROPOSER_H__

-#include "postgres.h"
-#include "access/xlogdefs.h"
-#include "port.h"
-#include "access/xlog_internal.h"
 #include "access/transam.h"
+#include "access/xlogdefs.h"
+#include "access/xlog_internal.h"
 #include "nodes/replnodes.h"
-#include "utils/uuid.h"
 #include "replication/walreceiver.h"
+#include "utils/uuid.h"

 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -3,11 +3,13 @@
 * This is needed to avoid linking to full postgres server installation. This file
 * is compiled as a part of libwalproposer static library.
 */
+#include "postgres.h"

 #include <stdio.h>
-#include "walproposer.h"
-#include "utils/datetime.h"
+
 #include "miscadmin.h"
+#include "utils/datetime.h"
+#include "walproposer.h"

 void
 ExceptionalCondition(const char *conditionName,
--- a/poetry.lock
+++ b/poetry.lock
@@ -2092,51 +2092,61 @@ files = [

 [[package]]
 name = "pyyaml"
-version = "6.0"
+version = "6.0.1"
 description = "YAML parser and emitter for Python"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
-    {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
-    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
-    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
-    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
-    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
-    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
-    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
-    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
-    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
-    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
-    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
-    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"},
-    {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"},
-    {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"},
-    {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"},
-    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"},
-    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"},
-    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"},
-    {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"},
-    {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"},
-    {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"},
-    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"},
-    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"},
-    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"},
-    {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"},
-    {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"},
-    {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"},
-    {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"},
-    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"},
-    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"},
-    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"},
-    {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"},
-    {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
-    {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"},
+    {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
+    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
+    {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
+    {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
+    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
+    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"},
+    {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"},
+    {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"},
+    {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
+    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
+    {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
+    {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
+    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
+    {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
 ]

 [[package]]
@@ -2553,85 +2563,101 @@ files = [

 [[package]]
 name = "yarl"
-version = "1.8.2"
+version = "1.9.4"
 description = "Yet another URL library"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"},
-    {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"},
-    {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"},
-    {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"},
-    {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"},
-    {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"},
-    {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"},
-    {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"},
-    {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"},
-    {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"},
-    {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"},
-    {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"},
-    {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"},
-    {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"},
-    {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"},
-    {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"},
-    {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"},
-    {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"},
-    {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"},
-    {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"},
-    {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"},
-    {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"},
-    {file = "yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"},
-    {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"},
-    {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"},
-    {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"},
-    {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"},
-    {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"},
-    {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"},
-    {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"},
-    {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"},
-    {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"},
-    {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"},
-    {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"},
+    {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"},
+    {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"},
+    {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"},
+    {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"},
+    {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"},
+    {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"},
+    {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"},
+    {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"},
+    {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"},
+    {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"},
+    {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"},
+    {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"},
+    {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"},
+    {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"},
+    {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"},
 ]

 [package.dependencies]
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -87,6 +87,10 @@ impl AuthError {
    pub fn too_many_connections() -> Self {
        AuthErrorImpl::TooManyConnections.into()
    }
+
+    pub fn is_auth_failed(&self) -> bool {
+        matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
+    }
 }

 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -9,9 +9,9 @@ use tokio_postgres::config::AuthKeys;
 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
-use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
-use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
+use crate::proxy::connect_compute::handle_try_wake;
+use crate::proxy::retry::retry_after;
 use crate::scram;
 use crate::stream::Stream;
 use crate::{
@@ -22,6 +22,7 @@ use crate::{
        provider::{CachedNodeInfo, ConsoleReqExtra},
        Api,
    },
+    metrics::LatencyTimer,
    stream, url,
 };
 use futures::TryFutureExt;
@@ -185,24 +186,52 @@ async fn auth_quirks(
    };

    info!("fetching user's authentication info");
-    // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
-    let AuthInfo {
-        secret,
-        allowed_ips,
-    } = api.get_auth_info(extra, &info).await?;
+    let allowed_ips = api.get_allowed_ips(extra, &info).await?;

    // check allowed list
    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
        return Err(auth::AuthError::ip_address_not_allowed());
    }
-    let secret = secret.unwrap_or_else(|| {
+    let cached_secret = api.get_role_secret(extra, &info).await?;
+
+    let secret = cached_secret.clone().unwrap_or_else(|| {
        // If we don't have an authentication secret, we mock one to
        // prevent malicious probing (possible due to missing protocol steps).
        // This mocked secret will never lead to successful authentication.
        info!("authentication info not found, mocking it");
        AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
    });
+    match authenticate_with_secret(
+        secret,
+        info,
+        client,
+        unauthenticated_password,
+        allow_cleartext,
+        config,
+        latency_timer,
+    )
+    .await
+    {
+        Ok(keys) => Ok(keys),
+        Err(e) => {
+            if e.is_auth_failed() {
+                // The password could have been changed, so we invalidate the cache.
+                cached_secret.invalidate();
+            }
+            Err(e)
+        }
+    }
+}

+async fn authenticate_with_secret(
+    secret: AuthSecret,
+    info: ComputeUserInfo,
+    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
+    unauthenticated_password: Option<Vec<u8>>,
+    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
+) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    if let Some(password) = unauthenticated_password {
        let auth_outcome = validate_password_and_exchange(&password, secret)?;
        let keys = match auth_outcome {
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,7 +4,7 @@ use crate::{
    compute,
    config::AuthenticationConfig,
    console::AuthSecret,
-    proxy::LatencyTimer,
+    metrics::LatencyTimer,
    sasl,
    stream::{PqStream, Stream},
 };
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -4,7 +4,7 @@ use super::{
 use crate::{
    auth::{self, AuthFlow},
    console::AuthSecret,
-    proxy::LatencyTimer,
+    metrics::LatencyTimer,
    sasl,
    stream::{self, Stream},
 };
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,9 +1,8 @@
 //! User credentials used in authentication.

 use crate::{
-    auth::password_hack::parse_endpoint_param,
-    error::UserFacingError,
-    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
+    auth::password_hack::parse_endpoint_param, error::UserFacingError,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -6,7 +6,10 @@ use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::console::provider::AllowedIpsCache;
 use proxy::console::provider::NodeInfoCache;
+use proxy::console::provider::RoleSecretCache;
 use proxy::http;
+use proxy::rate_limiter::EndpointRateLimiter;
+use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
 use proxy::usage_metrics;

@@ -14,6 +17,7 @@ use anyhow::bail;
 use proxy::config::{self, ProxyConfig};
 use proxy::serverless;
 use std::pin::pin;
+use std::sync::Arc;
 use std::{borrow::Cow, net::SocketAddr};
 use tokio::net::TcpListener;
 use tokio::task::JoinSet;
@@ -83,7 +87,7 @@ struct ProxyCliArgs {
    #[clap(long)]
    metric_collection_interval: Option<String>,
    /// cache for `wake_compute` api method (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
+    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    wake_compute_cache: String,
    /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
    #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
@@ -113,16 +117,22 @@ struct ProxyCliArgs {
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    rate_limiter_timeout: tokio::time::Duration,
    /// Endpoint rate limiter max number of requests per second.
-    #[clap(long, default_value_t = 300)]
-    endpoint_rps_limit: u32,
+    ///
+    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
+    /// Can be given multiple times for different bucket sizes.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    endpoint_rps_limit: Vec<RateBucketInfo>,
    /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
    #[clap(long, default_value_t = 100)]
    initial_limit: usize,
    #[clap(flatten)]
    aimd_config: proxy::rate_limiter::AimdConfig,
    /// cache for `allowed_ips` (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
+    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
+    /// cache for `role_secret` (use `size=0` to disable)
+    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
+    role_secret_cache: String,
    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_ip_check_for_http: bool,
@@ -157,6 +167,8 @@ async fn main() -> anyhow::Result<()> {
    let proxy_listener = TcpListener::bind(proxy_address).await?;
    let cancellation_token = CancellationToken::new();

+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
+
    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
@@ -164,6 +176,7 @@ async fn main() -> anyhow::Result<()> {
        config,
        proxy_listener,
        cancellation_token.clone(),
+        endpoint_rate_limiter.clone(),
    ));

    // TODO: rename the argument to something like serverless.
@@ -177,6 +190,7 @@ async fn main() -> anyhow::Result<()> {
            config,
            serverless_listener,
            cancellation_token.clone(),
+            endpoint_rate_limiter.clone(),
        ));
    }

@@ -256,9 +270,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        AuthBackend::Console => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?;
+            let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?;

            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
            info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}");
+            info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}");
            let caches = Box::leak(Box::new(console::caches::ApiCaches {
                node_info: NodeInfoCache::new(
                    "node_info_cache",
@@ -272,6 +288,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
                    allowed_ips_cache_config.ttl,
                    false,
                ),
+                role_secret: RoleSecretCache::new(
+                    "role_secret_cache",
+                    role_secret_cache_config.size,
+                    role_secret_cache_config.ttl,
+                    false,
+                ),
            }));

            let config::WakeComputeLockOptions {
@@ -311,6 +333,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let authentication_config = AuthenticationConfig {
        scram_protocol_timeout: args.scram_protocol_timeout,
    };
+
+    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
+    RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+
    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
@@ -320,8 +346,35 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        authentication_config,
        require_client_ip: args.require_client_ip,
        disable_ip_check_for_http: args.disable_ip_check_for_http,
-        endpoint_rps_limit: args.endpoint_rps_limit,
+        endpoint_rps_limit,
    }));

    Ok(config)
 }
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use clap::Parser;
+    use proxy::rate_limiter::RateBucketInfo;
+
+    #[test]
+    fn parse_endpoint_rps_limit() {
+        let config = super::ProxyCliArgs::parse_from([
+            "proxy",
+            "--endpoint-rps-limit",
+            "100@1s",
+            "--endpoint-rps-limit",
+            "20@30s",
+        ]);
+
+        assert_eq!(
+            config.endpoint_rps_limit,
+            vec![
+                RateBucketInfo::new(100, Duration::from_secs(1)),
+                RateBucketInfo::new(20, Duration::from_secs(30)),
+            ]
+        );
+    }
+}
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,9 +1,10 @@
 use crate::{
    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    error::UserFacingError, proxy::neon_option,
+    error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
+use metrics::IntCounterPairGuard;
 use pq_proto::StartupMessageParams;
 use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
@@ -223,6 +224,8 @@ pub struct PostgresConnection {
    pub params: std::collections::HashMap<String, String>,
    /// Query cancellation token.
    pub cancel_closure: CancelClosure,
+
+    _guage: IntCounterPairGuard,
 }

 impl ConnCfg {
@@ -231,6 +234,7 @@ impl ConnCfg {
        &self,
        allow_self_signed_compute: bool,
        timeout: Duration,
+        proto: &'static str,
    ) -> Result<PostgresConnection, ConnectionError> {
        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;

@@ -264,6 +268,7 @@ impl ConnCfg {
            stream,
            params,
            cancel_closure,
+            _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
        };

        Ok(connection)
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,4 +1,4 @@
-use crate::auth;
+use crate::{auth, rate_limiter::RateBucketInfo};
 use anyhow::{bail, ensure, Context, Ok};
 use rustls::{sign, Certificate, PrivateKey};
 use sha2::{Digest, Sha256};
@@ -20,7 +20,7 @@ pub struct ProxyConfig {
    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
    pub disable_ip_check_for_http: bool,
-    pub endpoint_rps_limit: u32,
+    pub endpoint_rps_limit: Vec<RateBucketInfo>,
 }

 #[derive(Debug)]
@@ -310,10 +310,10 @@ pub struct CacheOptions {

 impl CacheOptions {
    /// Default options for [`crate::console::provider::NodeInfoCache`].
-    pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m";
+    pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m";

    /// Parse cache options passed via cmdline.
-    /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
+    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
    fn parse(options: &str) -> anyhow::Result<Self> {
        let mut size = None;
        let mut ttl = None;
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -10,6 +10,7 @@ use crate::{
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
+use smol_str::SmolStr;
 use std::{sync::Arc, time::Duration};
 use tokio::{
    sync::{OwnedSemaphorePermit, Semaphore},
@@ -21,7 +22,7 @@ pub mod errors {
    use crate::{
        error::{io_error, UserFacingError},
        http,
-        proxy::ShouldRetry,
+        proxy::retry::ShouldRetry,
    };
    use thiserror::Error;

@@ -216,6 +217,7 @@ impl ConsoleReqExtra {
 }

 /// Auth secret which is managed by the cloud.
+#[derive(Clone)]
 pub enum AuthSecret {
    #[cfg(feature = "testing")]
    /// Md5 hash of user's password.
@@ -250,18 +252,20 @@ pub struct NodeInfo {

 pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
 pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>;
-pub type AllowedIpsCache = TimedLru<Arc<str>, Arc<Vec<String>>>;
+pub type AllowedIpsCache = TimedLru<SmolStr, Arc<Vec<String>>>;
+pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option<AuthSecret>>;
+pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>;

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
 #[async_trait]
 pub trait Api {
    /// Get the client's auth secret for authentication.
-    async fn get_auth_info(
+    async fn get_role_secret(
        &self,
        extra: &ConsoleReqExtra,
        creds: &ComputeUserInfo,
-    ) -> Result<AuthInfo, errors::GetAuthInfoError>;
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;

    async fn get_allowed_ips(
        &self,
@@ -282,7 +286,9 @@ pub struct ApiCaches {
    /// Cache for the `wake_compute` API method.
    pub node_info: NodeInfoCache,
    /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead.
-    pub allowed_ips: TimedLru<Arc<str>, Arc<Vec<String>>>,
+    pub allowed_ips: AllowedIpsCache,
+    /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead.
+    pub role_secret: RoleSecretCache,
 }

 /// Various caches for [`console`](super).
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -6,6 +6,7 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
+use crate::console::provider::CachedRoleSecret;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use async_trait::async_trait;
 use futures::TryFutureExt;
@@ -142,12 +143,14 @@ async fn get_execute_postgres_query(
 #[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
-    async fn get_auth_info(
+    async fn get_role_secret(
        &self,
        _extra: &ConsoleReqExtra,
        creds: &ComputeUserInfo,
-    ) -> Result<AuthInfo, GetAuthInfoError> {
-        self.do_get_auth_info(creds).await
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        Ok(CachedRoleSecret::new_uncached(
+            self.do_get_auth_info(creds).await?.secret,
+        ))
    }

    async fn get_allowed_ips(
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -3,9 +3,10 @@
 use super::{
    super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
-    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
+    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra,
+    NodeInfo,
 };
-use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
+use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
 use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
@@ -159,12 +160,25 @@ impl Api {
 #[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
-    async fn get_auth_info(
+    async fn get_role_secret(
        &self,
        extra: &ConsoleReqExtra,
        creds: &ComputeUserInfo,
-    ) -> Result<AuthInfo, GetAuthInfoError> {
-        self.do_get_auth_info(extra, creds).await
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        let ep = creds.endpoint.clone();
+        let user = creds.inner.user.clone();
+        if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) {
+            return Ok(role_secret);
+        }
+        let auth_info = self.do_get_auth_info(extra, creds).await?;
+        let (_, secret) = self
+            .caches
+            .role_secret
+            .insert((ep.clone(), user), auth_info.secret.clone());
+        self.caches
+            .allowed_ips
+            .insert(ep, Arc::new(auth_info.allowed_ips));
+        Ok(secret)
    }

    async fn get_allowed_ips(
@@ -172,8 +186,7 @@ impl super::Api for Api {
        extra: &ConsoleReqExtra,
        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
-        let key: &str = &creds.endpoint;
-        if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
+        if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) {
            ALLOWED_IPS_BY_CACHE_OUTCOME
                .with_label_values(&["hit"])
                .inc();
@@ -182,10 +195,14 @@ impl super::Api for Api {
        ALLOWED_IPS_BY_CACHE_OUTCOME
            .with_label_values(&["miss"])
            .inc();
-        let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips);
+        let auth_info = self.do_get_auth_info(extra, creds).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let ep = creds.endpoint.clone();
+        let user = creds.inner.user.clone();
        self.caches
-            .allowed_ips
-            .insert(key.into(), allowed_ips.clone());
+            .role_secret
+            .insert((ep.clone(), user), auth_info.secret);
+        self.caches.allowed_ips.insert(ep, allowed_ips.clone());
        Ok(allowed_ips)
    }

--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -13,7 +13,7 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::time::Instant;
 use tracing::trace;

-use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
+use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
 use reqwest_middleware::RequestBuilder;

 /// This is the preferred way to create new http clients,
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,6 +16,7 @@ pub mod console;
 pub mod error;
 pub mod http;
 pub mod logging;
+pub mod metrics;
 pub mod parse;
 pub mod protocol2;
 pub mod proxy;
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -0,0 +1,232 @@
+use ::metrics::{
+    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
+    IntCounterPairVec, IntCounterVec,
+};
+use prometheus::{
+    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
+    IntGaugeVec,
+};
+
+use once_cell::sync::Lazy;
+use tokio::time;
+
+pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_opened_db_connections_total",
+        "Number of opened connections to a database.",
+        "proxy_closed_db_connections_total",
+        "Number of closed connections to a database.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_opened_client_connections_total",
+        "Number of opened connections from a client.",
+        "proxy_closed_client_connections_total",
+        "Number of closed connections from a client.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_accepted_connections_total",
+        "Number of client connections accepted.",
+        "proxy_closed_connections_total",
+        "Number of client connections closed.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "proxy_compute_connection_latency_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // http/ws/tcp, true/false, true/false, success/failure
+        // 3 * 2 * 2 * 2 = 24 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome"],
+        // largest bucket = 2^16 * 0.5ms = 32s
+        exponential_buckets(0.0005, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "proxy_console_request_latency",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // proxy_wake_compute/proxy_get_role_info
+        &["request"],
+        // largest bucket = 2^16 * 0.2ms = 13s
+        exponential_buckets(0.0002, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_allowed_ips_cache_misses",
+        "Number of cache hits/misses for allowed ips",
+        // hit/miss
+        &["outcome"],
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_control_plane_token_acquire_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // largest bucket = 3^16 * 0.05ms = 2.15s
+        exponential_buckets(0.00005, 3.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "semaphore_control_plane_limit",
+        "Current limit of the semaphore control plane",
+        &["limit"], // 2 counters
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_accepted_connections_by_sni",
+        "Number of connections (per sni).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
+pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_allowed_ips_number",
+        "Number of allowed ips",
+        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
+    )
+    .unwrap()
+});
+
+pub struct LatencyTimer {
+    // time since the stopwatch was started
+    start: Option<time::Instant>,
+    // accumulated time on the stopwatch
+    accumulated: std::time::Duration,
+    // label data
+    protocol: &'static str,
+    cache_miss: bool,
+    pool_miss: bool,
+    outcome: &'static str,
+}
+
+pub struct LatencyTimerPause<'a> {
+    timer: &'a mut LatencyTimer,
+}
+
+impl LatencyTimer {
+    pub fn new(protocol: &'static str) -> Self {
+        Self {
+            start: Some(time::Instant::now()),
+            accumulated: std::time::Duration::ZERO,
+            protocol,
+            cache_miss: false,
+            // by default we don't do pooling
+            pool_miss: true,
+            // assume failed unless otherwise specified
+            outcome: "failed",
+        }
+    }
+
+    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
+        // stop the stopwatch and record the time that we have accumulated
+        let start = self.start.take().expect("latency timer should be started");
+        self.accumulated += start.elapsed();
+        LatencyTimerPause { timer: self }
+    }
+
+    pub fn cache_miss(&mut self) {
+        self.cache_miss = true;
+    }
+
+    pub fn pool_hit(&mut self) {
+        self.pool_miss = false;
+    }
+
+    pub fn success(mut self) {
+        self.outcome = "success";
+    }
+}
+
+impl Drop for LatencyTimerPause<'_> {
+    fn drop(&mut self) {
+        // start the stopwatch again
+        self.timer.start = Some(time::Instant::now());
+    }
+}
+
+impl Drop for LatencyTimer {
+    fn drop(&mut self) {
+        let duration =
+            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
+        COMPUTE_CONNECTION_LATENCY
+            .with_label_values(&[
+                self.protocol,
+                bool_to_str(self.cache_miss),
+                bool_to_str(self.pool_miss),
+                self.outcome,
+            ])
+            .observe(duration.as_secs_f64())
+    }
+}
+
+pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_total",
+        "Number of connection failures (per kind).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
+pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_breakdown",
+        "Number of wake-up failures (per kind).",
+        &["retry", "kind"],
+    )
+    .unwrap()
+});
+
+pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes_per_client",
+        "Number of bytes sent/received between client and backend.",
+        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
+    )
+    .unwrap()
+});
+
+pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes",
+        "Number of bytes sent/received between all clients and backends.",
+        &["direction"],
+    )
+    .unwrap()
+});
+
+pub const fn bool_to_str(x: bool) -> &'static str {
+    if x {
+        "true"
+    } else {
+        "false"
+    }
+}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -1,283 +1,41 @@
 #[cfg(test)]
 mod tests;

+pub mod connect_compute;
+pub mod retry;
+
 use crate::{
    auth,
    cancellation::{self, CancelMap},
-    compute::{self, PostgresConnection},
+    compute,
    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
-    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
-    http::StatusCode,
+    console::{self, messages::MetricsAuxInfo},
+    metrics::{
+        LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
+        NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
+    },
    protocol2::WithClientIp,
    rate_limiter::EndpointRateLimiter,
    stream::{PqStream, Stream},
    usage_metrics::{Ids, USAGE_METRICS},
 };
 use anyhow::{bail, Context};
-use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
-use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use prometheus::{
-    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
-    IntGaugeVec,
-};
 use regex::Regex;
-use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
-use tokio::{
-    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
-    time,
-};
+use std::{net::IpAddr, sync::Arc};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, info_span, Instrument};
 use utils::measured_stream::MeasuredStream;

-/// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
-pub const NUM_RETRIES_CONNECT: u32 = 16;
-const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
-const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
+use self::connect_compute::{connect_to_compute, TcpMechanism};

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";

-pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_opened_db_connections_total",
-        "Number of opened connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_closed_db_connections_total",
-        "Number of closed connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_opened_client_connections_total",
-        "Number of opened connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_closed_client_connections_total",
-        "Number of closed connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_accepted_connections_total",
-        "Number of client connections accepted.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_closed_connections_total",
-        "Number of client connections closed.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_compute_connection_latency_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // http/ws/tcp, true/false, true/false, success/failure
-        // 3 * 2 * 2 * 2 = 24 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome"],
-        // largest bucket = 2^16 * 0.5ms = 32s
-        exponential_buckets(0.0005, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_console_request_latency",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // proxy_wake_compute/proxy_get_role_info
-        &["request"],
-        // largest bucket = 2^16 * 0.2ms = 13s
-        exponential_buckets(0.0002, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_allowed_ips_cache_misses",
-        "Number of cache hits/misses for allowed ips",
-        // hit/miss
-        &["outcome"],
-    )
-    .unwrap()
-});
-
-pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_control_plane_token_acquire_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
-        exponential_buckets(0.00005, 3.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "semaphore_control_plane_limit",
-        "Current limit of the semaphore control plane",
-        &["limit"], // 2 counters
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_accepted_connections_by_sni",
-        "Number of connections (per sni).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_allowed_ips_number",
-        "Number of allowed ips",
-        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
-    )
-    .unwrap()
-});
-
-pub struct LatencyTimer {
-    // time since the stopwatch was started
-    start: Option<Instant>,
-    // accumulated time on the stopwatch
-    accumulated: std::time::Duration,
-    // label data
-    protocol: &'static str,
-    cache_miss: bool,
-    pool_miss: bool,
-    outcome: &'static str,
-}
-
-pub struct LatencyTimerPause<'a> {
-    timer: &'a mut LatencyTimer,
-}
-
-impl LatencyTimer {
-    pub fn new(protocol: &'static str) -> Self {
-        Self {
-            start: Some(Instant::now()),
-            accumulated: std::time::Duration::ZERO,
-            protocol,
-            cache_miss: false,
-            // by default we don't do pooling
-            pool_miss: true,
-            // assume failed unless otherwise specified
-            outcome: "failed",
-        }
-    }
-
-    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
-        // stop the stopwatch and record the time that we have accumulated
-        let start = self.start.take().expect("latency timer should be started");
-        self.accumulated += start.elapsed();
-        LatencyTimerPause { timer: self }
-    }
-
-    pub fn cache_miss(&mut self) {
-        self.cache_miss = true;
-    }
-
-    pub fn pool_hit(&mut self) {
-        self.pool_miss = false;
-    }
-
-    pub fn success(mut self) {
-        self.outcome = "success";
-    }
-}
-
-impl Drop for LatencyTimerPause<'_> {
-    fn drop(&mut self) {
-        // start the stopwatch again
-        self.timer.start = Some(Instant::now());
-    }
-}
-
-impl Drop for LatencyTimer {
-    fn drop(&mut self) {
-        let duration =
-            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
-        COMPUTE_CONNECTION_LATENCY
-            .with_label_values(&[
-                self.protocol,
-                bool_to_str(self.cache_miss),
-                bool_to_str(self.pool_miss),
-                self.outcome,
-            ])
-            .observe(duration.as_secs_f64())
-    }
-}
-
-static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_total",
-        "Number of connection failures (per kind).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_breakdown",
-        "Number of wake-up failures (per kind).",
-        &["retry", "kind"],
-    )
-    .unwrap()
-});
-
-static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes_per_client",
-        "Number of bytes sent/received between client and backend.",
-        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
-    )
-    .unwrap()
-});
-
-static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes",
-        "Number of bytes sent/received between all clients and backends.",
-        &["direction"],
-    )
-    .unwrap()
-});
-
 pub async fn run_until_cancelled<F: std::future::Future>(
    f: F,
    cancellation_token: &CancellationToken,
@@ -297,6 +55,7 @@ pub async fn task_main(
    config: &'static ProxyConfig,
    listener: tokio::net::TcpListener,
    cancellation_token: CancellationToken,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("proxy has shut down");
@@ -308,7 +67,6 @@ pub async fn task_main(

    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    let cancel_map = Arc::new(CancelMap::default());
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));

    while let Some(accept_result) =
        run_until_cancelled(listener.accept(), &cancellation_token).await
@@ -428,16 +186,12 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    );

    let proto = mode.protocol_label();
-    NUM_CLIENT_CONNECTION_OPENED_COUNTER
+    let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
        .with_label_values(&[proto])
-        .inc();
-    NUM_CONNECTIONS_ACCEPTED_COUNTER
+        .guard();
+    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
        .with_label_values(&[proto])
-        .inc();
-    scopeguard::defer! {
-        NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-    }
+        .guard();

    let tls = config.tls_config.as_ref();

@@ -561,294 +315,6 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    }
 }

-/// If we couldn't connect, a cached connection info might be to blame
-/// (e.g. the compute node's address might've changed at the wrong time).
-/// Invalidate the cache entry (if any) to prevent subsequent errors.
-#[tracing::instrument(name = "invalidate_cache", skip_all)]
-pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
-    let is_cached = node_info.cached();
-    if is_cached {
-        warn!("invalidating stalled compute node info cache entry");
-    }
-    let label = match is_cached {
-        true => "compute_cached",
-        false => "compute_uncached",
-    };
-    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
-
-    node_info.invalidate().config
-}
-
-/// Try to connect to the compute node once.
-#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
-async fn connect_to_compute_once(
-    node_info: &console::CachedNodeInfo,
-    timeout: time::Duration,
-) -> Result<PostgresConnection, compute::ConnectionError> {
-    let allow_self_signed_compute = node_info.allow_self_signed_compute;
-
-    node_info
-        .config
-        .connect(allow_self_signed_compute, timeout)
-        .await
-}
-
-#[async_trait]
-pub trait ConnectMechanism {
-    type Connection;
-    type ConnectError;
-    type Error: From<Self::ConnectError>;
-    async fn connect_once(
-        &self,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<Self::Connection, Self::ConnectError>;
-
-    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
-}
-
-pub struct TcpMechanism<'a> {
-    /// KV-dictionary with PostgreSQL connection params.
-    pub params: &'a StartupMessageParams,
-}
-
-#[async_trait]
-impl ConnectMechanism for TcpMechanism<'_> {
-    type Connection = PostgresConnection;
-    type ConnectError = compute::ConnectionError;
-    type Error = compute::ConnectionError;
-
-    async fn connect_once(
-        &self,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<PostgresConnection, Self::Error> {
-        connect_to_compute_once(node_info, timeout).await
-    }
-
-    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
-        config.set_startup_params(self.params);
-    }
-}
-
-const fn bool_to_str(x: bool) -> &'static str {
-    if x {
-        "true"
-    } else {
-        "false"
-    }
-}
-
-fn report_error(e: &WakeComputeError, retry: bool) {
-    use crate::console::errors::ApiError;
-    let retry = bool_to_str(retry);
-    let kind = match e {
-        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
-        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ref text,
-        }) if text.contains("written data quota exceeded")
-            || text.contains("the limit for current plan reached") =>
-        {
-            "quota_exceeded"
-        }
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ..
-        }) => "api_console_locked",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        }) => "api_console_bad_request",
-        WakeComputeError::ApiError(ApiError::Console { status, .. })
-            if status.is_server_error() =>
-        {
-            "api_console_other_server_error"
-        }
-        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
-        WakeComputeError::TimeoutError => "timeout_error",
-    };
-    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
-}
-
-/// Try to connect to the compute node, retrying if necessary.
-/// This function might update `node_info`, so we take it by `&mut`.
-#[tracing::instrument(skip_all)]
-pub async fn connect_to_compute<M: ConnectMechanism>(
-    mechanism: &M,
-    mut node_info: console::CachedNodeInfo,
-    extra: &console::ConsoleReqExtra,
-    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
-    mut latency_timer: LatencyTimer,
-) -> Result<M::Connection, M::Error>
-where
-    M::ConnectError: ShouldRetry + std::fmt::Debug,
-    M::Error: From<WakeComputeError>,
-{
-    mechanism.update_connect_config(&mut node_info.config);
-
-    // try once
-    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => {
-            latency_timer.success();
-            return Ok(res);
-        }
-        Err(e) => {
-            error!(error = ?e, "could not connect to compute node");
-            (invalidate_cache(node_info), e)
-        }
-    };
-
-    latency_timer.cache_miss();
-
-    let mut num_retries = 1;
-
-    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-    info!("compute node's state has likely changed; requesting a wake-up");
-    let node_info = loop {
-        let wake_res = match creds {
-            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
-            #[cfg(feature = "testing")]
-            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
-            // nothing to do?
-            auth::BackendType::Link(_) => return Err(err.into()),
-            // test backend
-            #[cfg(test)]
-            auth::BackendType::Test(x) => x.wake_compute(),
-        };
-
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                report_error(&e, false);
-                return Err(e.into());
-            }
-            // failed to wake up but we can continue to retry
-            Ok(ControlFlow::Continue(e)) => {
-                report_error(&e, true);
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            // successfully woke up a compute node and can break the wakeup loop
-            Ok(ControlFlow::Break(mut node_info)) => {
-                node_info.config.reuse_password(&config);
-                mechanism.update_connect_config(&mut node_info.config);
-                break node_info;
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    };
-
-    // now that we have a new node, try connect to it repeatedly.
-    // this can error for a few reasons, for instance:
-    // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
-    loop {
-        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => {
-                latency_timer.success();
-                return Ok(res);
-            }
-            Err(e) => {
-                let retriable = e.should_retry(num_retries);
-                if !retriable {
-                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-                    return Err(e.into());
-                }
-                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    }
-}
-
-/// Attempts to wake up the compute node.
-/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Break(node)) if the wakeup succeeded
-/// * Returns Err(e) if there was an error
-pub fn handle_try_wake(
-    result: Result<console::CachedNodeInfo, WakeComputeError>,
-    num_retries: u32,
-) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    match result {
-        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
-                Ok(ControlFlow::Continue(err))
-            }
-            _ => Err(err),
-        },
-        // Ready to try again.
-        Ok(new) => Ok(ControlFlow::Break(new)),
-    }
-}
-
-pub trait ShouldRetry {
-    fn could_retry(&self) -> bool;
-    fn should_retry(&self, num_retries: u32) -> bool {
-        match self {
-            _ if num_retries >= NUM_RETRIES_CONNECT => false,
-            err => err.could_retry(),
-        }
-    }
-}
-
-impl ShouldRetry for io::Error {
-    fn could_retry(&self) -> bool {
-        use std::io::ErrorKind;
-        matches!(
-            self.kind(),
-            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
-        )
-    }
-}
-
-impl ShouldRetry for tokio_postgres::error::DbError {
-    fn could_retry(&self) -> bool {
-        use tokio_postgres::error::SqlState;
-        matches!(
-            self.code(),
-            &SqlState::CONNECTION_FAILURE
-                | &SqlState::CONNECTION_EXCEPTION
-                | &SqlState::CONNECTION_DOES_NOT_EXIST
-                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
-        )
-    }
-}
-
-impl ShouldRetry for tokio_postgres::Error {
-    fn could_retry(&self) -> bool {
-        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
-            io::Error::could_retry(io_err)
-        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
-            tokio_postgres::error::DbError::could_retry(db_err)
-        } else {
-            false
-        }
-    }
-}
-
-impl ShouldRetry for compute::ConnectionError {
-    fn could_retry(&self) -> bool {
-        match self {
-            compute::ConnectionError::Postgres(err) => err.could_retry(),
-            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
-            _ => false,
-        }
-    }
-}
-
-pub fn retry_after(num_retries: u32) -> time::Duration {
-    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
-}
-
 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
@@ -1028,7 +494,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {

        let aux = node_info.aux.clone();
        let mut node = connect_to_compute(
-            &TcpMechanism { params },
+            &TcpMechanism { params, proto },
            node_info,
            &extra,
            &creds,
@@ -1037,13 +503,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        .or_else(|e| stream.throw_error(e))
        .await?;

-        NUM_DB_CONNECTIONS_OPENED_COUNTER
-            .with_label_values(&[proto])
-            .inc();
-        scopeguard::defer! {
-            NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-        }
-
        prepare_client_connection(&node, session, &mut stream).await?;
        // Before proxy passing, forward to compute whatever data is left in the
        // PqStream input buffer. Normally there is none, but our serverless npm
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -0,0 +1,238 @@
+use crate::{
+    auth,
+    compute::{self, PostgresConnection},
+    console::{self, errors::WakeComputeError, Api},
+    metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
+    proxy::retry::{retry_after, ShouldRetry},
+};
+use async_trait::async_trait;
+use hyper::StatusCode;
+use pq_proto::StartupMessageParams;
+use std::ops::ControlFlow;
+use tokio::time;
+use tracing::{error, info, warn};
+
+const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
+
+/// If we couldn't connect, a cached connection info might be to blame
+/// (e.g. the compute node's address might've changed at the wrong time).
+/// Invalidate the cache entry (if any) to prevent subsequent errors.
+#[tracing::instrument(name = "invalidate_cache", skip_all)]
+pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
+    let is_cached = node_info.cached();
+    if is_cached {
+        warn!("invalidating stalled compute node info cache entry");
+    }
+    let label = match is_cached {
+        true => "compute_cached",
+        false => "compute_uncached",
+    };
+    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
+
+    node_info.invalidate().config
+}
+
+/// Try to connect to the compute node once.
+#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
+async fn connect_to_compute_once(
+    node_info: &console::CachedNodeInfo,
+    timeout: time::Duration,
+    proto: &'static str,
+) -> Result<PostgresConnection, compute::ConnectionError> {
+    let allow_self_signed_compute = node_info.allow_self_signed_compute;
+
+    node_info
+        .config
+        .connect(allow_self_signed_compute, timeout, proto)
+        .await
+}
+
+#[async_trait]
+pub trait ConnectMechanism {
+    type Connection;
+    type ConnectError;
+    type Error: From<Self::ConnectError>;
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError>;
+
+    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
+}
+
+pub struct TcpMechanism<'a> {
+    /// KV-dictionary with PostgreSQL connection params.
+    pub params: &'a StartupMessageParams,
+    pub proto: &'static str,
+}
+
+#[async_trait]
+impl ConnectMechanism for TcpMechanism<'_> {
+    type Connection = PostgresConnection;
+    type ConnectError = compute::ConnectionError;
+    type Error = compute::ConnectionError;
+
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<PostgresConnection, Self::Error> {
+        connect_to_compute_once(node_info, timeout, self.proto).await
+    }
+
+    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
+        config.set_startup_params(self.params);
+    }
+}
+
+fn report_error(e: &WakeComputeError, retry: bool) {
+    use crate::console::errors::ApiError;
+    let retry = bool_to_str(retry);
+    let kind = match e {
+        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
+        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ref text,
+        }) if text.contains("written data quota exceeded")
+            || text.contains("the limit for current plan reached") =>
+        {
+            "quota_exceeded"
+        }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ..
+        }) => "api_console_locked",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        }) => "api_console_bad_request",
+        WakeComputeError::ApiError(ApiError::Console { status, .. })
+            if status.is_server_error() =>
+        {
+            "api_console_other_server_error"
+        }
+        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
+        WakeComputeError::TimeoutError => "timeout_error",
+    };
+    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
+}
+
+/// Try to connect to the compute node, retrying if necessary.
+/// This function might update `node_info`, so we take it by `&mut`.
+#[tracing::instrument(skip_all)]
+pub async fn connect_to_compute<M: ConnectMechanism>(
+    mechanism: &M,
+    mut node_info: console::CachedNodeInfo,
+    extra: &console::ConsoleReqExtra,
+    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
+    mut latency_timer: LatencyTimer,
+) -> Result<M::Connection, M::Error>
+where
+    M::ConnectError: ShouldRetry + std::fmt::Debug,
+    M::Error: From<WakeComputeError>,
+{
+    mechanism.update_connect_config(&mut node_info.config);
+
+    // try once
+    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+        Ok(res) => {
+            latency_timer.success();
+            return Ok(res);
+        }
+        Err(e) => {
+            error!(error = ?e, "could not connect to compute node");
+            (invalidate_cache(node_info), e)
+        }
+    };
+
+    latency_timer.cache_miss();
+
+    let mut num_retries = 1;
+
+    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+    info!("compute node's state has likely changed; requesting a wake-up");
+    let node_info = loop {
+        let wake_res = match creds {
+            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            #[cfg(feature = "testing")]
+            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
+            // nothing to do?
+            auth::BackendType::Link(_) => return Err(err.into()),
+            // test backend
+            #[cfg(test)]
+            auth::BackendType::Test(x) => x.wake_compute(),
+        };
+
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                report_error(&e, false);
+                return Err(e.into());
+            }
+            // failed to wake up but we can continue to retry
+            Ok(ControlFlow::Continue(e)) => {
+                report_error(&e, true);
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            // successfully woke up a compute node and can break the wakeup loop
+            Ok(ControlFlow::Break(mut node_info)) => {
+                node_info.config.reuse_password(&config);
+                mechanism.update_connect_config(&mut node_info.config);
+                break node_info;
+            }
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+    };
+
+    // now that we have a new node, try connect to it repeatedly.
+    // this can error for a few reasons, for instance:
+    // * DNS connection settings haven't quite propagated yet
+    info!("wake_compute success. attempting to connect");
+    loop {
+        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+            Ok(res) => {
+                latency_timer.success();
+                return Ok(res);
+            }
+            Err(e) => {
+                let retriable = e.should_retry(num_retries);
+                if !retriable {
+                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+                    return Err(e.into());
+                }
+                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+            }
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+    }
+}
+
+/// Attempts to wake up the compute node.
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
+/// * Returns Ok(Break(node)) if the wakeup succeeded
+/// * Returns Err(e) if there was an error
+pub fn handle_try_wake(
+    result: Result<console::CachedNodeInfo, WakeComputeError>,
+    num_retries: u32,
+) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
+    match result {
+        Err(err) => match &err {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+                Ok(ControlFlow::Continue(err))
+            }
+            _ => Err(err),
+        },
+        // Ready to try again.
+        Ok(new) => Ok(ControlFlow::Break(new)),
+    }
+}
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -0,0 +1,68 @@
+use crate::compute;
+use std::{error::Error, io};
+use tokio::time;
+
+/// Number of times we should retry the `/proxy_wake_compute` http request.
+/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
+pub const NUM_RETRIES_CONNECT: u32 = 16;
+const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
+const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
+
+pub trait ShouldRetry {
+    fn could_retry(&self) -> bool;
+    fn should_retry(&self, num_retries: u32) -> bool {
+        match self {
+            _ if num_retries >= NUM_RETRIES_CONNECT => false,
+            err => err.could_retry(),
+        }
+    }
+}
+
+impl ShouldRetry for io::Error {
+    fn could_retry(&self) -> bool {
+        use std::io::ErrorKind;
+        matches!(
+            self.kind(),
+            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
+        )
+    }
+}
+
+impl ShouldRetry for tokio_postgres::error::DbError {
+    fn could_retry(&self) -> bool {
+        use tokio_postgres::error::SqlState;
+        matches!(
+            self.code(),
+            &SqlState::CONNECTION_FAILURE
+                | &SqlState::CONNECTION_EXCEPTION
+                | &SqlState::CONNECTION_DOES_NOT_EXIST
+                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        )
+    }
+}
+
+impl ShouldRetry for tokio_postgres::Error {
+    fn could_retry(&self) -> bool {
+        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
+            io::Error::could_retry(io_err)
+        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
+            tokio_postgres::error::DbError::could_retry(db_err)
+        } else {
+            false
+        }
+    }
+}
+
+impl ShouldRetry for compute::ConnectionError {
+    fn could_retry(&self) -> bool {
+        match self {
+            compute::ConnectionError::Postgres(err) => err.could_retry(),
+            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
+            _ => false,
+        }
+    }
+}
+
+pub fn retry_after(num_retries: u32) -> time::Duration {
+    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
+}
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -2,10 +2,13 @@

 mod mitm;

+use super::connect_compute::ConnectMechanism;
+use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
 use crate::config::CertResolver;
 use crate::console::{CachedNodeInfo, NodeInfo};
+use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
@@ -423,7 +426,7 @@ impl ConnectMechanism for TestConnectMechanism {
    async fn connect_once(
        &self,
        _node_info: &console::CachedNodeInfo,
-        _timeout: time::Duration,
+        _timeout: std::time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
        let mut counter = self.counter.lock().unwrap();
        let action = self.sequence[*counter];
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -120,7 +120,7 @@ where
 struct PgFrame;
 impl Decoder for PgFrame {
    type Item = Bytes;
-    type Error = io::Error;
+    type Error = std::io::Error;

    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
        if src.len() < 5 {
@@ -136,7 +136,7 @@ impl Decoder for PgFrame {
    }
 }
 impl Encoder<Bytes> for PgFrame {
-    type Error = io::Error;
+    type Error = std::io::Error;

    fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> {
        dst.extend_from_slice(&item);
--- a/proxy/src/rate_limiter.rs
+++ b/proxy/src/rate_limiter.rs
@@ -3,5 +3,5 @@ mod limit_algorithm;
 mod limiter;
 pub use aimd::Aimd;
 pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
-pub use limiter::EndpointRateLimiter;
 pub use limiter::Limiter;
+pub use limiter::{EndpointRateLimiter, RateBucketInfo};
--- a/proxy/src/rate_limiter/aimd.rs
+++ b/proxy/src/rate_limiter/aimd.rs
@@ -33,39 +33,6 @@ impl Aimd {
            min_utilisation_threshold: config.aimd_min_utilisation_threshold,
        }
    }
-
-    pub fn decrease_factor(self, factor: f32) -> Self {
-        assert!((0.5..1.0).contains(&factor));
-        Self {
-            decrease_factor: factor,
-            ..self
-        }
-    }
-
-    pub fn increase_by(self, increase: usize) -> Self {
-        assert!(increase > 0);
-        Self {
-            increase_by: increase,
-            ..self
-        }
-    }
-
-    pub fn with_max_limit(self, max: usize) -> Self {
-        assert!(max > 0);
-        Self {
-            max_limit: max,
-            ..self
-        }
-    }
-
-    /// A threshold below which the limit won't be increased. 0.5 = 50%.
-    pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self {
-        assert!(min_util > 0. && min_util < 1.);
-        Self {
-            min_utilisation_threshold: min_util,
-            ..self
-        }
-    }
 }

 #[async_trait]
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -1,16 +1,19 @@
 use std::{
+    collections::hash_map::RandomState,
+    hash::BuildHasher,
    sync::{
        atomic::{AtomicUsize, Ordering},
-        Arc,
+        Arc, Mutex,
    },
-    time::Duration,
 };

+use anyhow::bail;
 use dashmap::DashMap;
-use parking_lot::Mutex;
+use itertools::Itertools;
+use rand::{rngs::StdRng, Rng, SeedableRng};
 use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
-use tokio::time::{timeout, Instant};
+use tokio::time::{timeout, Duration, Instant};
 use tracing::info;

 use super::{
@@ -29,60 +32,166 @@ use super::{
 // saw SNI, before doing TLS handshake. User-side error messages in that case
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
-//
-// TODO: add a better bucketing here, e.g. not more than 300 requests per second,
-//       and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects
-//       are noramal during redeployments, so we should not block them.
-pub struct EndpointRateLimiter {
-    map: DashMap<SmolStr, Arc<Mutex<(chrono::NaiveTime, u32)>>>,
-    max_rps: u32,
+pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
+    map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
+    info: &'static [RateBucketInfo],
    access_count: AtomicUsize,
+    rand: Mutex<Rand>,
+}
+
+#[derive(Clone, Copy)]
+struct RateBucket {
+    start: Instant,
+    count: u32,
+}
+
+impl RateBucket {
+    fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
+        if now - self.start < info.interval {
+            self.count < info.max_rpi
+        } else {
+            // bucket expired, reset
+            self.count = 0;
+            self.start = now;
+
+            true
+        }
+    }
+
+    fn inc(&mut self) {
+        self.count += 1;
+    }
+}
+
+#[derive(Clone, Copy, PartialEq)]
+pub struct RateBucketInfo {
+    pub interval: Duration,
+    // requests per interval
+    pub max_rpi: u32,
+}
+
+impl std::fmt::Display for RateBucketInfo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
+        write!(f, "{rps}@{}", humantime::format_duration(self.interval))
+    }
+}
+
+impl std::fmt::Debug for RateBucketInfo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{self}")
+    }
+}
+
+impl std::str::FromStr for RateBucketInfo {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let Some((max_rps, interval)) = s.split_once('@') else {
+            bail!("invalid rate info")
+        };
+        let max_rps = max_rps.parse()?;
+        let interval = humantime::parse_duration(interval)?;
+        Ok(Self::new(max_rps, interval))
+    }
+}
+
+impl RateBucketInfo {
+    pub const DEFAULT_SET: [Self; 3] = [
+        Self::new(300, Duration::from_secs(1)),
+        Self::new(200, Duration::from_secs(60)),
+        Self::new(100, Duration::from_secs(600)),
+    ];
+
+    pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
+        info.sort_unstable_by_key(|info| info.interval);
+        let invalid = info
+            .iter()
+            .tuple_windows()
+            .find(|(a, b)| a.max_rpi > b.max_rpi);
+        if let Some((a, b)) = invalid {
+            bail!(
+                "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
+                b.max_rpi,
+                a.max_rpi,
+            );
+        }
+
+        Ok(())
+    }
+
+    pub const fn new(max_rps: u32, interval: Duration) -> Self {
+        Self {
+            interval,
+            max_rpi: max_rps * interval.as_millis() as u32 / 1000,
+        }
+    }
 }

 impl EndpointRateLimiter {
-    pub fn new(max_rps: u32) -> Self {
+    pub fn new(info: &'static [RateBucketInfo]) -> Self {
+        Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
+    }
+}
+
+impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
+    fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
+        info!(buckets = ?info, "endpoint rate limiter");
        Self {
-            map: DashMap::new(),
-            max_rps,
+            info,
+            map: DashMap::with_hasher_and_shard_amount(hasher, 64),
            access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
+            rand: Mutex::new(rand),
        }
    }

    /// Check that number of connections to the endpoint is below `max_rps` rps.
    pub fn check(&self, endpoint: SmolStr) -> bool {
-        // do GC every 100k requests (worst case memory usage is about 10MB)
-        if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 {
+        // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
+        // worst case memory usage is about:
+        //    = 2 * 2048 * 64 * (48B + 72B)
+        //    = 30MB
+        if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
            self.do_gc();
        }

-        let now = chrono::Utc::now().naive_utc().time();
-        let entry = self
-            .map
-            .entry(endpoint)
-            .or_insert_with(|| Arc::new(Mutex::new((now, 0))));
-        let mut entry = entry.lock();
-        let (last_time, count) = *entry;
+        let now = Instant::now();
+        let mut entry = self.map.entry(endpoint).or_insert_with(|| {
+            vec![
+                RateBucket {
+                    start: now,
+                    count: 0,
+                };
+                self.info.len()
+            ]
+        });

-        if now - last_time < chrono::Duration::seconds(1) {
-            if count >= self.max_rps {
-                return false;
-            }
-            *entry = (last_time, count + 1);
-        } else {
-            *entry = (now, 1);
+        let should_allow_request = entry
+            .iter_mut()
+            .zip(self.info)
+            .all(|(bucket, info)| bucket.should_allow_request(info, now));
+
+        if should_allow_request {
+            // only increment the bucket counts if the request will actually be accepted
+            entry.iter_mut().for_each(RateBucket::inc);
        }
-        true
+
+        should_allow_request
    }

-    /// Clean the map. Simple strategy: remove all entries. At worst, we'll
-    /// double the effective max_rps during the cleanup. But that way deletion
-    /// does not aquire mutex on each entry access.
+    /// Clean the map. Simple strategy: remove all entries in a random shard.
+    /// At worst, we'll double the effective max_rps during the cleanup.
+    /// But that way deletion does not aquire mutex on each entry access.
    pub fn do_gc(&self) {
        info!(
            "cleaning up endpoint rate limiter, current size = {}",
            self.map.len()
        );
-        self.map.clear();
+        let n = self.map.shards().len();
+        // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide
+        // (impossible, infact, unless we have 2048 threads)
+        let shard = self.rand.lock().unwrap().gen_range(0..n);
+        self.map.shards()[shard].write().clear();
    }
 }

@@ -124,7 +233,6 @@ pub struct Token<'t> {
 #[derive(Debug, Clone, Copy)]
 pub struct LimiterState {
    limit: usize,
-    available: usize,
    in_flight: usize,
 }

@@ -285,10 +393,10 @@ impl Limiter {
            }
            new_limit
        };
-        crate::proxy::RATE_LIMITER_LIMIT
+        crate::metrics::RATE_LIMITER_LIMIT
            .with_label_values(&["expected"])
            .set(new_limit as i64);
-        crate::proxy::RATE_LIMITER_LIMIT
+        crate::metrics::RATE_LIMITER_LIMIT
            .with_label_values(&["actual"])
            .set(actual_limit as i64);
        self.limits.store(new_limit, Ordering::Release);
@@ -302,11 +410,7 @@ impl Limiter {
    pub fn state(&self) -> LimiterState {
        let limit = self.limits.load(Ordering::Relaxed);
        let in_flight = self.in_flight.load(Ordering::Relaxed);
-        LimiterState {
-            limit,
-            available: limit.saturating_sub(in_flight),
-            in_flight,
-        }
+        LimiterState { limit, in_flight }
    }
 }

@@ -319,13 +423,6 @@ impl<'t> Token<'t> {
        }
    }

-    #[cfg(test)]
-    pub fn set_latency(&mut self, latency: Duration) {
-        use std::ops::Sub;
-
-        self.start = Instant::now().sub(latency);
-    }
-
    pub fn forget(&mut self) {
        if let Some(permit) = self.permit.take() {
            permit.forget();
@@ -344,10 +441,6 @@ impl LimiterState {
    pub fn limit(&self) -> usize {
        self.limit
    }
-    /// The amount of concurrency available to use.
-    pub fn available(&self) -> usize {
-        self.available
-    }
    /// The number of jobs in flight.
    pub fn in_flight(&self) -> usize {
        self.in_flight
@@ -377,7 +470,7 @@ impl reqwest_middleware::Middleware for Limiter {
                )
            })?;
        info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
-        crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
+        crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
        match next.run(req, extensions).await {
            Ok(response) => {
                self.release(token, Some(Outcome::from_reqwest_response(&response)))
@@ -395,12 +488,16 @@ impl reqwest_middleware::Middleware for Limiter {

 #[cfg(test)]
 mod tests {
-    use std::{pin::pin, task::Context, time::Duration};
+    use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration};

    use futures::{task::noop_waker_ref, Future};
+    use rand::SeedableRng;
+    use rustc_hash::FxHasher;
+    use smol_str::SmolStr;
+    use tokio::time;

-    use super::{Limiter, Outcome};
-    use crate::rate_limiter::RateLimitAlgorithm;
+    use super::{EndpointRateLimiter, Limiter, Outcome};
+    use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};

    #[tokio::test]
    async fn it_works() {
@@ -509,4 +606,105 @@ mod tests {
        limiter.release(token1, None).await;
        limiter.release(token2, None).await;
    }
+
+    #[test]
+    fn rate_bucket_rpi() {
+        let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5));
+        assert_eq!(rate_bucket.max_rpi, 50 * 5);
+
+        let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500));
+        assert_eq!(rate_bucket.max_rpi, 50 / 2);
+    }
+
+    #[test]
+    fn rate_bucket_parse() {
+        let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap();
+        assert_eq!(rate_bucket.interval, Duration::from_secs(10));
+        assert_eq!(rate_bucket.max_rpi, 100 * 10);
+        assert_eq!(rate_bucket.to_string(), "100@10s");
+
+        let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap();
+        assert_eq!(rate_bucket.interval, Duration::from_secs(60));
+        assert_eq!(rate_bucket.max_rpi, 100 * 60);
+        assert_eq!(rate_bucket.to_string(), "100@1m");
+    }
+
+    #[test]
+    fn default_rate_buckets() {
+        let mut defaults = RateBucketInfo::DEFAULT_SET;
+        RateBucketInfo::validate(&mut defaults[..]).unwrap();
+    }
+
+    #[test]
+    #[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
+    fn rate_buckets_validate() {
+        let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
+            .into_iter()
+            .map(|s| s.parse().unwrap())
+            .collect();
+        RateBucketInfo::validate(&mut rates).unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_rate_limits() {
+        let mut rates: Vec<RateBucketInfo> = ["100@1s", "20@30s"]
+            .into_iter()
+            .map(|s| s.parse().unwrap())
+            .collect();
+        RateBucketInfo::validate(&mut rates).unwrap();
+        let limiter = EndpointRateLimiter::new(Vec::leak(rates));
+
+        let endpoint = SmolStr::from("ep-my-endpoint-1234");
+
+        time::pause();
+
+        for _ in 0..100 {
+            assert!(limiter.check(endpoint.clone()));
+        }
+        // more connections fail
+        assert!(!limiter.check(endpoint.clone()));
+
+        // fail even after 500ms as it's in the same bucket
+        time::advance(time::Duration::from_millis(500)).await;
+        assert!(!limiter.check(endpoint.clone()));
+
+        // after a full 1s, 100 requests are allowed again
+        time::advance(time::Duration::from_millis(500)).await;
+        for _ in 1..6 {
+            for _ in 0..100 {
+                assert!(limiter.check(endpoint.clone()));
+            }
+            time::advance(time::Duration::from_millis(1000)).await;
+        }
+
+        // more connections after 600 will exceed the 20rps@30s limit
+        assert!(!limiter.check(endpoint.clone()));
+
+        // will still fail before the 30 second limit
+        time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
+        assert!(!limiter.check(endpoint.clone()));
+
+        // after the full 30 seconds, 100 requests are allowed again
+        time::advance(time::Duration::from_millis(1)).await;
+        for _ in 0..100 {
+            assert!(limiter.check(endpoint.clone()));
+        }
+    }
+
+    #[tokio::test]
+    async fn test_rate_limits_gc() {
+        // fixed seeded random/hasher to ensure that the test is not flaky
+        let rand = rand::rngs::StdRng::from_seed([1; 32]);
+        let hasher = BuildHasherDefault::<FxHasher>::default();
+
+        let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
+            &RateBucketInfo::DEFAULT_SET,
+            rand,
+            hasher,
+        );
+        for i in 0..1_000_000 {
+            limiter.check(format!("{i}").into());
+        }
+        assert!(limiter.map.len() < 150_000);
+    }
 }
--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32;
 /// One of the keys derived from the [password](super::password::SaltedPassword).
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
-#[derive(Default, PartialEq, Eq)]
+#[derive(Clone, Default, PartialEq, Eq)]
 #[repr(transparent)]
 pub struct ScramKey {
    bytes: [u8; SCRAM_KEY_LEN],
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -5,6 +5,7 @@ use super::key::ScramKey;

 /// Server secret is produced from [password](super::password::SaltedPassword)
 /// and is used throughout the authentication process.
+#[derive(Clone)]
 pub struct ServerSecret {
    /// Number of iterations for `PBKDF2` function.
    pub iterations: u32,
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -8,12 +8,13 @@ mod websocket;

 use anyhow::bail;
 use hyper::StatusCode;
+use metrics::IntCounterPairGuard;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;

+use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
-use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
@@ -38,13 +39,13 @@ pub async fn task_main(
    config: &'static ProxyConfig,
    ws_listener: TcpListener,
    cancellation_token: CancellationToken,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("websocket server has shut down");
    }

    let conn_pool = conn_pool::GlobalConnPool::new(config);
-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit));

    // shutdown the connection pool
    tokio::spawn({
@@ -149,22 +150,17 @@ pub async fn task_main(

 struct MetricService<S> {
    inner: S,
+    _gauge: IntCounterPairGuard,
 }

 impl<S> MetricService<S> {
    fn new(inner: S) -> MetricService<S> {
-        NUM_CLIENT_CONNECTION_OPENED_COUNTER
-            .with_label_values(&["http"])
-            .inc();
-        MetricService { inner }
-    }
-}
-
-impl<S> Drop for MetricService<S> {
-    fn drop(&mut self) {
-        NUM_CLIENT_CONNECTION_CLOSED_COUNTER
-            .with_label_values(&["http"])
-            .inc();
+        MetricService {
+            inner,
+            _gauge: NUM_CLIENT_CONNECTION_GAUGE
+                .with_label_values(&["http"])
+                .guard(),
+        }
    }
 }

@@ -248,7 +244,7 @@ async fn request_handler(
            .header("Access-Control-Allow-Origin", "*")
            .header(
                "Access-Control-Allow-Headers",
-                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
+                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
            )
            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
--- a/Show More
+++ b/Show More