Compare commits

..

2 Commits

Author           SHA1         Message                     Date
Conrad Ludgate   6d2bbffdab   only for console            2023-12-15 12:28:50 +00:00
Conrad Ludgate   7151bcc175   proxy console force http2   2023-12-15 12:26:51 +00:00
72 changed files with 1239 additions and 2327 deletions

Cargo.lock (generated), 76 lines changed
View File

@@ -233,7 +233,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -244,7 +244,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -881,7 +881,7 @@ dependencies = [
"regex",
"rustc-hash",
"shlex",
"syn 2.0.32",
"syn 2.0.28",
"which",
]
@@ -1095,7 +1095,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -1245,19 +1245,16 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"camino",
"clap",
"comfy-table",
"compute_api",
"futures",
"git-version",
"hex",
"hyper",
"nix 0.26.2",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres",
"postgres_backend",
"postgres_connection",
@@ -1271,8 +1268,6 @@ dependencies = [
"tar",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-util",
"toml",
"tracing",
"url",
@@ -1486,7 +1481,7 @@ dependencies = [
"proc-macro2",
"quote",
"strsim",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -1497,7 +1492,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
dependencies = [
"darling_core",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -1572,7 +1567,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -1666,7 +1661,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -1920,7 +1915,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -2906,7 +2901,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -3167,7 +3162,6 @@ dependencies = [
"enum-map",
"hex",
"postgres_ffi",
"rand 0.8.5",
"serde",
"serde_json",
"serde_with",
@@ -3178,19 +3172,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_client"
version = "0.1.0"
dependencies = [
"async-trait",
"pageserver_api",
"reqwest",
"serde",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -3350,7 +3331,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -3557,7 +3538,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
dependencies = [
"proc-macro2",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -4165,7 +4146,7 @@ dependencies = [
"regex",
"relative-path",
"rustc_version",
"syn 2.0.32",
"syn 2.0.28",
"unicode-ident",
]
@@ -4311,7 +4292,6 @@ dependencies = [
"histogram",
"itertools",
"pageserver",
"pageserver_api",
"rand 0.8.5",
"remote_storage",
"reqwest",
@@ -4600,7 +4580,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -4681,7 +4661,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -4948,9 +4928,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.32"
version = "2.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
dependencies = [
"proc-macro2",
"quote",
@@ -5080,7 +5060,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -5198,7 +5178,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -5499,7 +5479,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]
@@ -5944,7 +5924,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
"wasm-bindgen-shared",
]
@@ -5978,7 +5958,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@@ -6315,7 +6295,7 @@ dependencies = [
"smallvec",
"subtle",
"syn 1.0.109",
"syn 2.0.32",
"syn 2.0.28",
"time",
"time-macros",
"tokio",
@@ -6377,22 +6357,22 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.7.31"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
checksum = "7a7af71d8643341260a65f89fa60c0eeaa907f34544d8f6d9b0df72f069b5e74"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.31"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
checksum = "9731702e2f0617ad526794ae28fbc6f6ca8849b5ba729666c2a5bc4b6ddee2cd"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.32",
"syn 2.0.28",
]
[[package]]

View File

@@ -5,7 +5,6 @@ members = [
"control_plane",
"pageserver",
"pageserver/ctl",
"pageserver/client",
"proxy",
"safekeeper",
"storage_broker",
@@ -183,7 +182,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);

View File

@@ -6,11 +6,9 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
nix.workspace = true
once_cell.workspace = true
@@ -26,11 +24,10 @@ tar.workspace = true
thiserror.workspace = true
toml.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-util.workspace = true
url.workspace = true
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_backend.workspace = true
safekeeper_api.workspace = true
postgres_connection.workspace = true

View File

@@ -9,7 +9,7 @@ pub struct AttachmentService {
env: LocalEnv,
listen: String,
path: PathBuf,
client: reqwest::Client,
client: reqwest::blocking::Client,
}
const COMMAND: &str = "attachment_service";
@@ -53,7 +53,7 @@ impl AttachmentService {
env: env.clone(),
path,
listen,
client: reqwest::ClientBuilder::new()
client: reqwest::blocking::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
@@ -64,7 +64,7 @@ impl AttachmentService {
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<Child> {
pub fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
background_process::start_process(
@@ -73,11 +73,10 @@ impl AttachmentService {
&self.env.attachment_service_bin(),
["-l", &self.listen, "-p", &path_str],
[],
background_process::InitialPidFile::Create(self.pid_file()),
background_process::InitialPidFile::Create(&self.pid_file()),
// TODO: a real status check
|| async move { anyhow::Ok(true) },
|| Ok(true),
)
.await
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
@@ -85,7 +84,7 @@ impl AttachmentService {
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
pub async fn attach_hook(
pub fn attach_hook(
&self,
tenant_id: TenantId,
pageserver_id: NodeId,
@@ -105,16 +104,16 @@ impl AttachmentService {
node_id: Some(pageserver_id),
};
let response = self.client.post(url).json(&request).send().await?;
let response = self.client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<AttachHookResponse>().await?;
let response = response.json::<AttachHookResponse>()?;
Ok(response.gen)
}
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
let url = self
@@ -127,12 +126,12 @@ impl AttachmentService {
let request = InspectRequest { tenant_id };
let response = self.client.post(url).json(&request).send().await?;
let response = self.client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<InspectResponse>().await?;
let response = response.json::<InspectResponse>()?;
Ok(response.attachment)
}
}
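Editor's note: the attachment service hunks above switch from async reqwest to the blocking client. As a point of reference, the request/response round trip they now perform looks roughly like this in isolation (a minimal sketch; the helper name and the generic request/response types are illustrative, not the repo's API):

    use anyhow::anyhow;
    use reqwest::StatusCode;

    fn post_json<Req, Resp>(
        client: &reqwest::blocking::Client,
        url: &str,
        body: &Req,
    ) -> anyhow::Result<Resp>
    where
        Req: serde::Serialize,
        Resp: serde::de::DeserializeOwned,
    {
        // Synchronous round trip: the thread blocks until the response arrives, no .await needed.
        let response = client.post(url).json(body).send()?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }
        Ok(response.json::<Resp>()?)
    }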

View File

@@ -44,15 +44,15 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
pub enum InitialPidFile {
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(Utf8PathBuf),
Create(&'t Utf8Path),
/// The process will create the pidfile itself, need to wait for that event.
Expect(Utf8PathBuf),
Expect(&'t Utf8Path),
}
/// Start a background child process using the parameters given.
pub async fn start_process<F, Fut, AI, A, EI>(
pub fn start_process<F, AI, A, EI>(
process_name: &str,
datadir: &Path,
command: &Path,
@@ -62,8 +62,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
F: Fn() -> anyhow::Result<bool>,
AI: IntoIterator<Item = A>,
A: AsRef<OsStr>,
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
@@ -90,7 +89,7 @@ where
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(path) => {
pre_exec_create_pidfile(filled_cmd, path);
path
@@ -108,7 +107,7 @@ where
);
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check).await {
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
@@ -317,20 +316,22 @@ where
cmd
}
async fn process_started<F, Fut>(
fn process_started<F>(
pid: Pid,
pid_file_to_check: &Utf8Path,
pid_file_to_check: Option<&Utf8Path>,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<bool>>,
F: Fn() -> anyhow::Result<bool>,
{
match status_check().await {
Ok(true) => match pid_file::read(pid_file_to_check)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
},
None => Ok(true),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),
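Editor's note: the signature change above is the core of the de-asyncing: the status callback now returns a plain Result instead of a future. A self-contained sketch of the before/after shapes (function names are illustrative only):

    use std::future::Future;

    // Before: the status callback had to be async, which forced start_process and
    // every caller up the chain to be async as well.
    async fn wait_started_async<F, Fut>(status_check: F) -> anyhow::Result<bool>
    where
        F: Fn() -> Fut,
        Fut: Future<Output = anyhow::Result<bool>>,
    {
        status_check().await
    }

    // After: a plain closure suffices, so the caller can drop the Tokio runtime entirely.
    fn wait_started<F>(status_check: F) -> anyhow::Result<bool>
    where
        F: Fn() -> anyhow::Result<bool>,
    {
        status_check()
    }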

View File

@@ -120,20 +120,15 @@ fn main() -> Result<()> {
let mut env = LocalEnv::load_config().context("Error loading config")?;
let original_env = env.clone();
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"tenant" => handle_tenant(sub_args, &mut env),
"timeline" => handle_timeline(sub_args, &mut env),
"start" => handle_start_all(sub_args, &env),
"stop" => handle_stop_all(sub_args, &env),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
"pageserver" => handle_pageserver(sub_args, &env),
"attachment_service" => handle_attachment_service(sub_args, &env),
"safekeeper" => handle_safekeeper(sub_args, &env),
"endpoint" => handle_endpoint(sub_args, &env),
"mappings" => handle_mappings(sub_args, &mut env),
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
_ => bail!("unexpected subcommand {sub_name}"),
@@ -274,13 +269,12 @@ fn print_timeline(
/// Returns a map of timeline IDs to timeline_id@lsn strings.
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_id: &TenantId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(tenant_id)
.await?
.timeline_list(tenant_id)?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
.collect())
@@ -379,14 +373,11 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
.collect()
}
async fn handle_tenant(
tenant_match: &ArgMatches,
env: &mut local_env::LocalEnv,
) -> anyhow::Result<()> {
fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
let pageserver = get_default_pageserver(env);
match tenant_match.subcommand() {
Some(("list", _)) => {
for t in pageserver.tenant_list().await? {
for t in pageserver.tenant_list()? {
println!("{} {:?}", t.id, t.state);
}
}
@@ -403,16 +394,12 @@ async fn handle_tenant(
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.attach_hook(tenant_id, pageserver.conf.id)
.await?
attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
} else {
None
};
pageserver
.tenant_create(tenant_id, generation, tenant_conf)
.await?;
pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
println!("tenant {tenant_id} successfully created on the pageserver");
// Create an initial timeline for the new tenant
@@ -422,16 +409,14 @@ async fn handle_tenant(
.copied()
.context("Failed to parse postgres version from the argument string")?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
)
.await?;
let timeline_info = pageserver.timeline_create(
tenant_id,
new_timeline_id,
None,
None,
Some(pg_version),
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -465,7 +450,6 @@ async fn handle_tenant(
pageserver
.tenant_config(tenant_id, tenant_conf)
.await
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
@@ -474,7 +458,7 @@ async fn handle_tenant(
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_id, new_pageserver).await?;
migrate_tenant(env, tenant_id, new_pageserver)?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
@@ -484,13 +468,13 @@ async fn handle_tenant(
Ok(())
}
async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let pageserver = get_default_pageserver(env);
match timeline_match.subcommand() {
Some(("list", list_match)) => {
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
let timelines = pageserver.timeline_list(&tenant_id)?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
@@ -506,16 +490,14 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)
.await?;
let timeline_info = pageserver.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -560,9 +542,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let mut cplane = ComputeControlPlane::load(env.clone())?;
println!("Importing timeline into pageserver ...");
pageserver
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
.await?;
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
println!("Creating endpoint for imported timeline ...");
@@ -598,16 +578,14 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let timeline_info = pageserver
.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)
.await?;
let timeline_info = pageserver.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -626,7 +604,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
Ok(())
}
async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match ep_match.subcommand() {
Some(ep_subcommand_data) => ep_subcommand_data,
None => bail!("no endpoint subcommand provided"),
@@ -636,12 +614,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
match sub_name {
"list" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
});
let timeline_name_mappings = env.timeline_name_mappings();
@@ -815,9 +791,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
};
println!("Starting existing endpoint {endpoint_id}...");
endpoint
.start(&auth_token, safekeepers, remote_ext_config)
.await?;
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
}
"reconfigure" => {
let endpoint_id = sub_args
@@ -835,7 +809,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
} else {
None
};
endpoint.reconfigure(pageserver_id).await?;
endpoint.reconfigure(pageserver_id)?;
}
"stop" => {
let endpoint_id = sub_args
@@ -901,12 +875,11 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
))
}
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
@@ -933,10 +906,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -950,17 +920,14 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
match get_pageserver(env, subcommand_args)?.check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
@@ -975,14 +942,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_attachment_service(
sub_match: &ArgMatches,
env: &local_env::LocalEnv,
) -> Result<()> {
fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let svc = AttachmentService::from_env(env);
match sub_match.subcommand() {
Some(("start", _start_match)) => {
if let Err(e) = svc.start().await {
if let Err(e) = svc.start() {
eprintln!("start failed: {e}");
exit(1);
}
@@ -1023,7 +987,7 @@ fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
.collect()
}
async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(safekeeper_command_data) => safekeeper_command_data,
None => bail!("no safekeeper subcommand provided"),
@@ -1041,7 +1005,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
"start" => {
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts).await {
if let Err(e) = safekeeper.start(extra_opts) {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1067,7 +1031,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts).await {
if let Err(e) = safekeeper.start(extra_opts) {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1080,15 +1044,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically
broker::start_broker_process(env).await?;
broker::start_broker_process(env)?;
// Only start the attachment service if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let attachment_service = AttachmentService::from_env(env);
if let Err(e) = attachment_service.start().await {
if let Err(e) = attachment_service.start() {
eprintln!("attachment_service start failed: {:#}", e);
try_stop_all(env, true);
exit(1);
@@ -1097,10 +1061,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match))
.await
{
if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true);
exit(1);
@@ -1109,7 +1070,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]).await {
if let Err(e) = safekeeper.start(vec![]) {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false);
exit(1);

View File

@@ -11,7 +11,7 @@ use camino::Utf8PathBuf;
use crate::{background_process, local_env};
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;
@@ -19,15 +19,15 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
let args = [format!("--listen-addr={listen_addr}")];
let client = reqwest::Client::new();
let client = reqwest::blocking::Client::new();
background_process::start_process(
"storage_broker",
&env.base_data_dir,
&env.storage_broker_bin(),
args,
[],
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|| async {
background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
|| {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {
format!("Failed to append /status path to broker endpoint {url}")
@@ -36,13 +36,12 @@ pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<(
.get(status_url)
.build()
.with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
match client.execute(request).await {
match client.execute(request) {
Ok(resp) => Ok(resp.status().is_success()),
Err(_) => Ok(false),
}
},
)
.await
.context("Failed to spawn storage_broker subprocess")?;
Ok(())
}

View File

@@ -464,7 +464,7 @@ impl Endpoint {
}
}
pub async fn start(
pub fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
@@ -587,7 +587,7 @@ impl Endpoint {
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
loop {
attempt += 1;
match self.get_status().await {
match self.get_status() {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
@@ -629,8 +629,8 @@ impl Endpoint {
}
// Call the /status HTTP API
pub async fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::Client::new();
pub fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::blocking::Client::new();
let response = client
.request(
@@ -641,17 +641,16 @@ impl Endpoint {
self.http_address.port()
),
)
.send()
.await?;
.send()?;
// Interpret the response
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(response.json().await?)
Ok(response.json()?)
} else {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = response.url().to_owned();
let msg = match response.text().await {
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
@@ -659,7 +658,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -688,7 +687,7 @@ impl Endpoint {
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}
let client = reqwest::Client::new();
let client = reqwest::blocking::Client::new();
let response = client
.post(format!(
"http://{}:{}/configure",
@@ -699,15 +698,14 @@ impl Endpoint {
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()
.await?;
.send()?;
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(())
} else {
let url = response.url().to_owned();
let msg = match response.text().await {
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};

View File

@@ -6,24 +6,28 @@
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::io::Write;
use std::fs::File;
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::time::Duration;
use std::{io, result};
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::auth::{Claims, Scope};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};
@@ -34,6 +38,45 @@ use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
#[derive(Error, Debug)]
pub enum PageserverHttpError {
#[error("Reqwest error: {0}")]
Transport(#[from] reqwest::Error),
#[error("Error: {0}")]
Response(String),
}
impl From<anyhow::Error> for PageserverHttpError {
fn from(e: anyhow::Error) -> Self {
Self::Response(e.to_string())
}
}
type Result<T> = result::Result<T, PageserverHttpError>;
pub trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> Result<Self>;
}
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(PageserverHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
}
//
// Control routines for pageserver.
//
@@ -44,7 +87,8 @@ pub struct PageServerNode {
pub pg_connection_config: PgConnectionConfig,
pub conf: PageServerConf,
pub env: LocalEnv,
pub http_client: mgmt_api::Client,
pub http_client: Client,
pub http_base_url: String,
}
impl PageServerNode {
@@ -56,19 +100,8 @@ impl PageServerNode {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: mgmt_api::Client::new(
format!("http://{}", conf.listen_http_addr),
{
match conf.http_auth_type {
AuthType::Trust => None,
AuthType::NeonJWT => Some(
env.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
.unwrap(),
),
}
}
.as_deref(),
),
http_client: Client::new(),
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
}
}
@@ -149,8 +182,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false).await
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, false)
}
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -191,12 +224,7 @@ impl PageServerNode {
Ok(())
}
async fn start_node(
&self,
config_overrides: &[&str],
update_config: bool,
) -> anyhow::Result<Child> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
@@ -204,7 +232,7 @@ impl PageServerNode {
self.pg_connection_config.raw_address(),
datadir
);
io::stdout().flush().context("flush stdout")?;
io::stdout().flush()?;
let datadir_path_str = datadir.to_str().with_context(|| {
format!(
@@ -216,23 +244,20 @@ impl PageServerNode {
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
background_process::start_process(
"pageserver",
&datadir,
&self.env.pageserver_bin(),
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
let st = self.check_status().await;
match st {
Ok(()) => Ok(true),
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
.await
}
fn pageserver_basic_args<'a>(
@@ -278,12 +303,7 @@ impl PageServerNode {
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub async fn page_server_psql_client(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
let mut config = self.pg_connection_config.clone();
if self.conf.pg_auth_type == AuthType::NeonJWT {
let token = self
@@ -291,18 +311,36 @@ impl PageServerNode {
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
config = config.set_password(Some(token));
}
Ok(config.connect_no_tls().await?)
Ok(config.connect_no_tls()?)
}
pub async fn check_status(&self) -> mgmt_api::Result<()> {
self.http_client.status().await
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
let mut builder = self.http_client.request(method, url);
if self.conf.http_auth_type == AuthType::NeonJWT {
let token = self
.env
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
builder = builder.bearer_auth(token)
}
Ok(builder)
}
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
.send()?
.error_from_body()?;
Ok(())
}
pub async fn tenant_create(
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
Ok(self
.http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
.send()?
.error_from_body()?
.json()?)
}
pub fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
@@ -380,10 +418,23 @@ impl PageServerNode {
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
Ok(self.http_client.tenant_create(&request).await?)
self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
.json(&request)
.send()?
.error_from_body()?
.json::<Option<String>>()
.with_context(|| {
format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
})?
.context("No tenant id was found in the tenant creation response")
.and_then(|tenant_id_string| {
tenant_id_string.parse().with_context(|| {
format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
})
})
}
pub async fn tenant_config(
pub fn tenant_config(
&self,
tenant_id: TenantId,
mut settings: HashMap<&str, &str>,
@@ -462,30 +513,54 @@ impl PageServerNode {
bail!("Unrecognized tenant settings: {settings:?}")
}
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
.json(&models::TenantConfigRequest { tenant_id, config })
.send()?
.error_from_body()?;
Ok(())
}
pub async fn location_config(
pub fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_id, config, flush_ms)
.await?)
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/tenant/{}/location_config",
self.http_base_url, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.http_request(Method::PUT, path)?
.json(&req_body)
.send()?
.error_from_body()?;
Ok(())
}
pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_id).await?)
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfo> = self
.http_request(
Method::GET,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.send()?
.error_from_body()?
.json()?;
Ok(timeline_infos)
}
pub async fn timeline_create(
pub fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: Option<TimelineId>,
@@ -496,14 +571,29 @@ impl PageServerNode {
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
let req = models::TimelineCreateRequest {
self.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)?
.json(&models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
ancestor_timeline_id,
pg_version,
existing_initdb_timeline_id,
};
Ok(self.http_client.timeline_create(tenant_id, &req).await?)
})
.send()?
.error_from_body()?
.json::<Option<TimelineInfo>>()
.with_context(|| {
format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
})?
.with_context(|| {
format!(
"No timeline id was found in the timeline creation response for tenant {tenant_id}"
)
})
}
/// Import a basebackup prepared using either:
@@ -515,7 +605,7 @@ impl PageServerNode {
/// * `timeline_id` - id to assign to imported timeline
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
pub async fn timeline_import(
pub fn timeline_import(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -523,60 +613,36 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
tokio::pin!(client);
let mut client = self.page_server_psql_client()?;
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
let base_tarfile = File::open(base_tarfile_path)?;
let mut base_reader = BufReader::new(base_tarfile);
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
let wal_tarfile = File::open(wal_tarfile_path)?;
let wal_reader = BufReader::new(wal_tarfile);
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};
// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
let import_cmd = format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
);
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut base_reader, &mut writer)?;
writer.finish()?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
)
.await?;
if let Some(mut wal_reader) = wal_reader {
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut wal_reader, &mut writer)?;
writer.finish()?;
}
Ok(())
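Editor's note: the rewritten timeline_import streams the tarballs through the blocking postgres crate's COPY IN writer using std::io::copy. Reduced to its essentials, the pattern is roughly the following (a sketch; the helper name and parameters are illustrative):

    use std::fs::File;
    use std::io::{self, BufReader};

    fn copy_file_into(
        client: &mut postgres::Client,
        copy_cmd: &str,
        path: &str,
    ) -> anyhow::Result<()> {
        let mut reader = BufReader::new(File::open(path)?);
        // copy_in returns a writer that streams the COPY payload; finish() completes the operation.
        let mut writer = client.copy_in(copy_cmd)?;
        io::copy(&mut reader, &mut writer)?;
        writer.finish()?;
        Ok(())
    }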

View File

@@ -13,6 +13,7 @@ use std::{io, result};
use anyhow::Context;
use camino::Utf8PathBuf;
use postgres_connection::PgConnectionConfig;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{http::error::HttpErrorBody, id::NodeId};
@@ -33,14 +34,12 @@ pub enum SafekeeperHttpError {
type Result<T> = result::Result<T, SafekeeperHttpError>;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
impl ResponseErrorMessageExt for Response {
fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
@@ -49,7 +48,7 @@ impl ResponseErrorMessageExt for reqwest::Response {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = self.url().to_owned();
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>().await {
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
@@ -70,7 +69,7 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: reqwest::Client,
pub http_client: Client,
pub http_base_url: String,
}
@@ -81,7 +80,7 @@ impl SafekeeperNode {
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: reqwest::Client::new(),
http_client: Client::new(),
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
}
}
@@ -104,7 +103,7 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),
@@ -192,16 +191,13 @@ impl SafekeeperNode {
&self.env.safekeeper_bin(),
&args,
[],
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
match self.check_status().await {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
.await
}
///
@@ -220,7 +216,7 @@ impl SafekeeperNode {
)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> reqwest::RequestBuilder {
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
// TODO: authentication
//if self.env.auth_type == AuthType::NeonJWT {
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
@@ -228,12 +224,10 @@ impl SafekeeperNode {
self.http_client.request(method, url)
}
pub async fn check_status(&self) -> Result<()> {
pub fn check_status(&self) -> Result<()> {
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
.send()
.await?
.error_from_body()
.await?;
.send()?
.error_from_body()?;
Ok(())
}
}

View File

@@ -19,11 +19,11 @@ use utils::{
};
/// Given an attached pageserver, retrieve the LSN for all timelines
async fn get_lsns(
fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_id).await?;
let timelines = pageserver.timeline_list(&tenant_id)?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
@@ -32,13 +32,13 @@ async fn get_lsns(
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
async fn await_lsn(
fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
let latest = match get_lsns(tenant_id, pageserver) {
Ok(l) => l,
Err(e) => {
println!(
@@ -84,7 +84,7 @@ async fn await_lsn(
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub async fn migrate_tenant(
pub fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
@@ -108,18 +108,16 @@ pub async fn migrate_tenant(
}
}
let previous = attachment_service.inspect(tenant_id).await?;
let previous = attachment_service.inspect(tenant_id)?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
dest_ps.location_config(tenant_id, dest_conf, None)?;
println!("✅ Migration complete");
return Ok(());
}
@@ -128,24 +126,20 @@ pub async fn migrate_tenant(
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
}
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
dest_ps.location_config(tenant_id, dest_conf, None)?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline).await?;
await_lsn(tenant_id, &dest_ps, baseline)?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
@@ -155,7 +149,7 @@ pub async fn migrate_tenant(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
endpoint.reconfigure(Some(dest_ps.conf.id))?;
}
}
@@ -165,7 +159,7 @@ pub async fn migrate_tenant(
}
let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list().await?;
let other_ps_tenants = other_ps.tenant_list()?;
// Check if this tenant is attached
let found = other_ps_tenants
@@ -187,9 +181,7 @@ pub async fn migrate_tenant(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps
.location_config(tenant_id, secondary_conf, None)
.await?;
other_ps.location_config(tenant_id, secondary_conf, None)?;
}
println!(
@@ -197,7 +189,7 @@ pub async fn migrate_tenant(
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None).await?;
dest_ps.location_config(tenant_id, dest_conf, None)?;
println!("✅ Migration complete");

View File

@@ -24,4 +24,3 @@ workspace_hack.workspace = true
[dev-dependencies]
bincode.workspace = true
rand.workspace = true

View File

@@ -144,37 +144,3 @@ impl Key {
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
impl std::str::FromStr for Key {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
Self::from_hex(s)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
let key = Key {
field1: rng.gen(),
field2: rng.gen(),
field3: rng.gen(),
field4: rng.gen(),
field5: rng.gen(),
field6: rng.gen(),
};
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
}

View File

@@ -5,7 +5,6 @@ use const_format::formatcp;
/// Public API types
pub mod control_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod reltag;
pub mod shard;

View File

@@ -1,5 +1,3 @@
pub mod partitioning;
use std::{
collections::HashMap,
num::{NonZeroU64, NonZeroUsize},

View File

@@ -1,151 +0,0 @@
use utils::lsn::Lsn;
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,
pub at_lsn: Lsn,
}
impl serde::Serialize for Partitioning {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeSeq;
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
for kr in &self.0.ranges {
seq.serialize_element(&KeyRange(kr))?;
}
seq.end()
}
}
use serde::ser::SerializeMap;
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
}
}
pub struct WithDisplay<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(&self.0)
}
}
pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeTuple;
let mut t = serializer.serialize_tuple(2)?;
t.serialize_element(&WithDisplay(&self.0.start))?;
t.serialize_element(&WithDisplay(&self.0.end))?;
t.end()
}
}
impl<'a> serde::Deserialize<'a> for Partitioning {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'a>,
{
pub struct KeySpace(crate::keyspace::KeySpace);
impl<'de> serde::Deserialize<'de> for KeySpace {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
#[serde(transparent)]
struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::key::Key);
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct Range(Key, Key);
let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
Ok(Self(crate::keyspace::KeySpace {
ranges: ranges
.into_iter()
.map(|Range(start, end)| (start.0..end.0))
.collect(),
}))
}
}
#[serde_with::serde_as]
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
let de: De = serde::Deserialize::deserialize(deserializer)?;
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_serialization_roundtrip() {
let reference = r#"
{
"keys": [
[
"000000000000000000000000000000000000",
"000000000000000000000000000000000001"
],
[
"000000067F00000001000000000000000000",
"000000067F00000001000000000000000002"
],
[
"030000000000000000000000000000000000",
"030000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;
let de: Partitioning = serde_json::from_str(reference).unwrap();
let ser = serde_json::to_string(&de).unwrap();
let ser_de: serde_json::Value = serde_json::from_str(&ser).unwrap();
assert_eq!(
ser_de,
serde_json::from_str::<'_, serde_json::Value>(reference).unwrap()
);
}
}

View File

@@ -163,18 +163,8 @@ impl PgConnectionConfig {
}
/// Connect using postgres protocol with TLS disabled.
pub async fn connect_no_tls(
&self,
) -> Result<
(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
),
postgres::Error,
> {
self.to_tokio_postgres_config()
.connect(postgres::NoTls)
.await
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Config::from(self.to_tokio_postgres_config()).connect(postgres::NoTls)
}
}
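Editor's note: with connect_no_tls made blocking, callers can open a connection without any async runtime. A minimal sketch using the postgres crate directly (connection parameters are placeholders):

    fn open_example_connection() -> Result<postgres::Client, postgres::Error> {
        // Plain synchronous connection; no Tokio runtime is required on the caller's side.
        postgres::Client::connect("host=127.0.0.1 port=5432 user=postgres", postgres::NoTls)
    }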

View File

@@ -2,11 +2,8 @@ use std::time::Duration;
use tokio_util::sync::CancellationToken;
#[derive(thiserror::Error, Debug)]
pub enum TimeoutCancellableError {
#[error("Timed out")]
Timeout,
#[error("Cancelled")]
Cancelled,
}

View File

@@ -1,6 +1,3 @@
//! Links with walproposer, pgcommon, pgport and runs bindgen on walproposer.h
//! to generate Rust bindings for it.
use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};

View File

@@ -1,6 +1,3 @@
//! A C-Rust shim: defines implementation of C walproposer API, assuming wp
//! callback_data stores Box to some Rust implementation.
#![allow(dead_code)]
use std::ffi::CStr;

View File

@@ -1,14 +0,0 @@
[package]
name = "pageserver_client"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1 +0,0 @@
pub mod mgmt_api;

View File

@@ -1,188 +0,0 @@
use pageserver_api::models::*;
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
};
#[derive(Debug)]
pub struct Client {
mgmt_api_endpoint: String,
authorization_header: Option<String>,
client: reqwest::Client,
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("receive body: {0}")]
ReceiveBody(reqwest::Error),
#[error("receive error body: {0}")]
ReceiveErrorBody(String),
#[error("pageserver API: {0}")]
ApiError(String),
}
pub type Result<T> = std::result::Result<T, Error>;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(mut self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
let url = self.url().to_owned();
Err(match self.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
}
})
}
}
impl Client {
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self {
mgmt_api_endpoint,
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
client: reqwest::Client::new(),
}
}
pub async fn list_tenants(&self) -> Result<Vec<pageserver_api::models::TenantInfo>> {
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
let resp = self.get(&uri).await?;
resp.json().await.map_err(Error::ReceiveBody)
}
pub async fn list_timelines(
&self,
tenant_id: TenantId,
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_info(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn keyspace(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
async fn get<U: IntoUrl>(&self, uri: U) -> Result<reqwest::Response> {
self.request(Method::GET, uri, ()).await
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
let req = self.client.request(method, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
let response = res.error_from_body().await?;
Ok(response)
}
pub async fn status(&self) -> Result<()> {
let uri = format!("{}/v1/status", self.mgmt_api_endpoint);
self.get(&uri).await?;
Ok(())
}
pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
Ok(())
}
pub async fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
) -> Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.request(Method::PUT, &path, &req_body).await?;
Ok(())
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{}/timeline",
self.mgmt_api_endpoint, tenant_id
);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}
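The file above is the entire pageserver management-API client. For context, a hedged usage sketch (the endpoint address is a placeholder and the helper function is not part of the change set):

async fn smoke_check() -> Result<()> {
    // `Result` is the module's alias, i.e. std::result::Result<T, Error>.
    let client = Client::new("http://127.0.0.1:9898".to_string(), None);
    client.status().await?;                     // GET /v1/status
    let tenants = client.list_tenants().await?; // GET /v1/tenant
    println!("pageserver reports {} tenant(s)", tenants.len());
    Ok(())
}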

View File

@@ -41,8 +41,6 @@ use crate::{
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
pub mod defaults {
use crate::tenant::config::defaults::*;
use const_format::formatcp;
@@ -63,8 +61,6 @@ pub mod defaults {
pub const DEFAULT_LOG_FORMAT: &str = "plain";
pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
@@ -98,7 +94,6 @@ pub mod defaults {
#log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
@@ -185,11 +180,6 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
@@ -293,7 +283,6 @@ struct PageServerConfigBuilder {
log_format: BuilderValue<LogFormat>,
concurrent_tenant_warmup: BuilderValue<NonZeroUsize>,
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
metric_collection_interval: BuilderValue<Duration>,
@@ -351,8 +340,6 @@ impl Default for PageServerConfigBuilder {
.expect("cannot parse default keepalive interval")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant")),
concurrent_tenant_size_logical_size_queries: Set(
ConfigurableSemaphore::DEFAULT_INITIAL,
),
@@ -466,10 +453,6 @@ impl PageServerConfigBuilder {
self.log_format = BuilderValue::Set(log_format)
}
pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) {
self.concurrent_tenant_warmup = BuilderValue::Set(u);
}
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
@@ -535,9 +518,6 @@ impl PageServerConfigBuilder {
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
@@ -590,7 +570,6 @@ impl PageServerConfigBuilder {
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
@@ -828,11 +807,6 @@ impl PageServerConf {
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
"concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
}),
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
@@ -930,10 +904,6 @@ impl PageServerConf {
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5000),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
.expect("Invalid default constant"),
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
@@ -1152,9 +1122,6 @@ background_task_maximum_delay = '334 s'
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
)?,
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
@@ -1221,9 +1188,6 @@ background_task_maximum_delay = '334 s'
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5),
log_format: LogFormat::Json,
concurrent_tenant_warmup: ConfigurableSemaphore::new(
NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap()
),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
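The removed concurrent_tenant_warmup option was parsed with the same pattern as its neighbours above: read the TOML string, parse a usize, and reject zero. A standalone sketch of that pattern (helper name and the use of anyhow here are illustrative):

use anyhow::Context;
use std::num::NonZeroUsize;

fn parse_initial_permits(input: &str) -> anyhow::Result<NonZeroUsize> {
    let permits: usize = input
        .parse()
        .with_context(|| format!("expected a number of initial permits, not {input:?}"))?;
    NonZeroUsize::new(permits)
        .context("initial semaphore permits out of range: 0, use other configuration to disable a feature")
}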

View File

@@ -1,2 +1,4 @@
pub mod routes;
pub use routes::make_router;
pub use pageserver_api::models;

View File

@@ -28,13 +28,16 @@ use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use super::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
@@ -47,10 +50,6 @@ use crate::tenant::timeline::Timeline;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
};
use utils::{
auth::SwappableJwtAuth,
generation::Generation,
@@ -66,12 +65,7 @@ use utils::{
};
// Imports only used for testing APIs
use pageserver_api::models::ConfigureFailpointsRequest;
// For APIs that require an Active tenant, how long should we block waiting for that state?
// This is not functionally necessary (clients will retry), but avoids generating a lot of
// failed API calls while tenants are activating.
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
use super::models::ConfigureFailpointsRequest;
pub struct State {
conf: &'static PageServerConf,
@@ -239,19 +233,6 @@ impl From<GetTenantError> for ApiError {
}
}
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
GetActiveTenantError::WaitForActiveTimeout { .. } => {
ApiError::ResourceUnavailable(format!("{}", e).into())
}
}
}
}
impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
@@ -454,10 +435,7 @@ async fn timeline_create_handler(
let state = get_state(&request);
async {
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, true)?;
match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -716,23 +694,11 @@ async fn timeline_delete_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)
.map_err(|e| {
match e {
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
// want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
"Requested tenant is missing".to_string().into_boxed_str(),
),
e => e.into(),
}
})?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
state.tenant_manager.delete_timeline(tenant_shard_id, timeline_id, &ctx)
.instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
@@ -1170,10 +1136,7 @@ async fn tenant_create_handler(
// We created the tenant. Existing API semantics are that the tenant
// is Active when this function returns.
if let res @ Err(_) = new_tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await
{
if let res @ Err(_) = new_tenant.wait_to_become_active().await {
// This shouldn't happen because we just created the tenant directory
// in tenant::mgr::create_tenant, and there aren't any remote timelines
// to load, so, nothing can really fail during load.
@@ -1524,6 +1487,69 @@ async fn timeline_collect_keyspace(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
struct Partitioning {
keys: crate::keyspace::KeySpace,
at_lsn: Lsn,
}
impl serde::Serialize for Partitioning {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeMap;
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
}
}
struct WithDisplay<'a, T>(&'a T);
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
serializer.collect_str(&self.0)
}
}
struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
impl<'a> serde::Serialize for KeySpace<'a> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeSeq;
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
for kr in &self.0.ranges {
seq.serialize_element(&KeyRange(kr))?;
}
seq.end()
}
}
struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
impl<'a> serde::Serialize for KeyRange<'a> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeTuple;
let mut t = serializer.serialize_tuple(2)?;
t.serialize_element(&WithDisplay(&self.0.start))?;
t.serialize_element(&WithDisplay(&self.0.end))?;
t.end()
}
}
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
async {
@@ -1535,9 +1561,7 @@ async fn timeline_collect_keyspace(
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
json_response(StatusCode::OK, res)
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
}
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
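Taken together, these serializers render the response as a JSON object of the form {"keys": [["<start key>", "<end key>"], ...], "at_lsn": "<lsn>"}, with each key and the LSN in their Display form; the placeholder strings here are illustrative.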

View File

@@ -1,12 +1,11 @@
use crate::repository::{key_range_size, singleton_range, Key};
use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug, Default, PartialEq, Eq)]
#[derive(Clone, Debug, Default)]
pub struct KeySpace {
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
@@ -187,33 +186,6 @@ impl KeySpaceRandomAccum {
}
}
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -10,7 +10,7 @@ pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub use pageserver_api::keyspace;
pub mod keyspace;
pub mod metrics;
pub mod page_cache;
pub mod page_service;

View File

@@ -522,18 +522,14 @@ pub(crate) mod initial_logical_size {
impl StartCalculation {
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0
.with_label_values(&["first", circumstances_label])
.inc();
self.0.with_label_values(&["first", circumstances_label]);
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
}
}
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0
.with_label_values(&["retry", circumstances_label])
.inc();
self.0.with_label_values(&["retry", circumstances_label]);
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
}
@@ -688,54 +684,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register pageserver_startup_is_loading")
});
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
///
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant
/// metrics are rather expensive, and usually fine grained stuff makes more sense
/// at a timeline level than tenant level.
pub(crate) struct TenantMetrics {
/// How long did tenants take to go from construction to active state?
pub(crate) activation: Histogram,
pub(crate) preload: Histogram,
pub(crate) attach: Histogram,
/// How many tenants are included in the initial startup of the pageserver?
pub(crate) startup_scheduled: IntCounter,
pub(crate) startup_complete: IntCounter,
}
pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
TenantMetrics {
activation: register_histogram!(
/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
preload: register_histogram!(
"pageserver_tenant_preload_seconds",
"Time taken by tenants to load remote metadata on startup/attach, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
attach: register_histogram!(
"pageserver_tenant_attach_seconds",
"Time taken by tenants to intialize, after remote metadata is already loaded",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register metric"),
startup_scheduled: register_int_counter!(
"pageserver_tenant_startup_scheduled",
"Number of tenants included in pageserver startup (doesn't count tenants attached later)"
).expect("Failed to register metric"),
startup_complete: register_int_counter!(
"pageserver_tenant_startup_complete",
"Number of tenants that have completed warm-up, or activated on-demand during initial startup: \
should eventually reach `pageserver_tenant_startup_scheduled_total`. Does not include broken \
tenants: such cases will lead to this metric never reaching the scheduled count."
).expect("Failed to register metric"),
}
.expect("Failed to register pageserver_tenant_activation_seconds metric")
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
@@ -1023,62 +979,12 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
[
1,
10,
20,
40,
60,
80,
100,
200,
300,
400,
500,
600,
700,
800,
900,
1_000, // 1ms
2_000,
4_000,
6_000,
8_000,
10_000, // 10ms
20_000,
40_000,
60_000,
80_000,
100_000,
200_000,
400_000,
600_000,
800_000,
1_000_000, // 1s
2_000_000,
4_000_000,
6_000_000,
8_000_000,
10_000_000, // 10s
20_000_000,
50_000_000,
100_000_000,
200_000_000,
1_000_000_000, // 1000s
]
.into_iter()
.map(Duration::from_micros)
.map(|d| d.as_secs_f64())
.collect()
});
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds_global",
"Time spent on smgr query handling, aggregated by query type.",
&["smgr_query_type"],
SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
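The deleted bucket table above is written in integer microseconds and converted into the seconds that Prometheus histograms expect. A standalone sketch of that conversion:

use std::time::Duration;

// Mirrors the .map(Duration::from_micros).map(|d| d.as_secs_f64()) chain above.
fn to_seconds_buckets(micros: &[u64]) -> Vec<f64> {
    micros
        .iter()
        .map(|&us| Duration::from_micros(us).as_secs_f64())
        .collect()
}

// e.g. to_seconds_buckets(&[1, 1_000, 1_000_000]) -> [1e-6, 1e-3, 1.0]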
@@ -2307,9 +2213,6 @@ pub fn preinitialize_metrics() {
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// Tenant stats
Lazy::force(&TENANT);
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);

View File

@@ -2,11 +2,38 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::AddAssign;
use std::ops::{AddAssign, Range};
use std::time::Duration;
pub use pageserver_api::key::{Key, KEY_SIZE};
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
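(In other words: when both endpoints agree on fields 1-4, key_range_size is the 64-bit distance between their (field5, field6) pairs, saturated at u32::MAX; a range that crosses any of the higher fields always reports u32::MAX.)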
/// A 'value' stored for one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(test, derive(PartialEq))]

View File

@@ -36,8 +36,6 @@ use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext;
use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
@@ -61,7 +59,7 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT;
use crate::metrics::TENANT_ACTIVATION;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -228,7 +226,7 @@ pub struct Tenant {
/// The value creation timestamp, used to measure activation delay, see:
/// <https://github.com/neondatabase/neon/issues/4025>
constructed_at: Instant,
loading_started_at: Instant,
state: watch::Sender<TenantState>,
@@ -278,11 +276,6 @@ pub struct Tenant {
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
pub(crate) activate_now_sem: tokio::sync::Semaphore,
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
// Cancellation token fires when we have entered shutdown(). This is a parent of
@@ -629,14 +622,6 @@ impl Tenant {
"attach tenant",
false,
async move {
// Is this tenant being spawned as part of process startup?
let starting_up = init_order.is_some();
scopeguard::defer! {
if starting_up {
TENANT.startup_complete.inc();
}
}
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
let make_broken =
|t: &Tenant, err: anyhow::Error| {
@@ -663,62 +648,8 @@ impl Tenant {
.as_mut()
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
OnDemand,
// During normal operations after startup, we are attaching a tenant.
Normal,
}
// Before doing any I/O, wait for either of:
// - A client to attempt to access to this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in process lifetime.
let attach_type = if init_order.is_some() {
tokio::select!(
_ = tenant_clone.activate_now_sem.acquire() => {
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
// stayed in Activating for such a long time that shutdown found it in
// that state.
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
return Ok(());
},
)
} else {
AttachType::Normal
};
let preload_timer = TENANT.preload.start_timer();
let preload = match mode {
SpawnMode::Create => {
// Don't count the skipped preload into the histogram of preload durations
preload_timer.stop_and_discard();
None
},
SpawnMode::Create => {None},
SpawnMode::Normal => {
match &remote_storage {
Some(remote_storage) => Some(
@@ -728,11 +659,7 @@ impl Tenant {
tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
)
.await {
Ok(p) => {
preload_timer.observe_duration();
p
}
,
Ok(p) => p,
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
return Ok(());
@@ -794,43 +721,15 @@ impl Tenant {
}
}
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
let attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
match tenant_clone.attach(preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
if let Some(t)= attach_timer {t.observe_duration();}
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
if let Some(t)= attach_timer {t.observe_duration();}
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
// If we are doing an opportunistic warmup attachment at startup, initialize
// logical size at the same time. This is better than starting a bunch of idle tenants
// with cold caches and then coming back later to initialize their logical sizes.
//
// It also prevents the warmup process from competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if let AttachType::Warmup(_permit) = attach_type {
let mut futs = FuturesUnordered::new();
let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
for t in timelines {
futs.push(t.await_initial_logical_size())
}
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {
}
tracing::info!("Warm-up complete");
}
Ok(())
}
.instrument({
@@ -1797,15 +1696,6 @@ impl Tenant {
Ok(loaded_timeline)
}
pub(crate) async fn delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
DeleteTimelineFlow::run(&self, timeline_id, false).await?;
Ok(())
}
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by gc task.
/// also it can be explicitly requested through page server api 'do_gc' command.
@@ -1967,7 +1857,7 @@ impl Tenant {
);
*current_state = TenantState::Active;
let elapsed = self.constructed_at.elapsed();
let elapsed = self.loading_started_at.elapsed();
let total_timelines = timelines_accessor.len();
// log a lot of stuff, because some tenants sometimes suffer from user-visible
@@ -1982,7 +1872,7 @@ impl Tenant {
"activation attempt finished"
);
TENANT.activation.observe(elapsed.as_secs_f64());
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
});
}
}
@@ -2237,41 +2127,18 @@ impl Tenant {
self.state.subscribe()
}
/// The activate_now semaphore is initialized with zero units. As soon as
/// we add a unit, waiters will be able to acquire a unit and proceed.
pub(crate) fn activate_now(&self) {
self.activate_now_sem.add_permits(1);
}
pub(crate) async fn wait_to_become_active(
&self,
timeout: Duration,
) -> Result<(), GetActiveTenantError> {
pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
let mut receiver = self.state.subscribe();
loop {
let current_state = receiver.borrow_and_update().clone();
match current_state {
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
// in these states, there's a chance that we can reach ::Active
self.activate_now();
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
Ok(r) => {
r.map_err(
|_e: tokio::sync::watch::error::RecvError|
// Tenant existed but was dropped: report it as non-existent
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
)?
}
Err(TimeoutCancellableError::Cancelled) => {
return Err(GetActiveTenantError::Cancelled);
}
Err(TimeoutCancellableError::Timeout) => {
return Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: Some(self.current_state()),
wait_time: timeout,
});
}
}
receiver.changed().await.map_err(
|_e: tokio::sync::watch::error::RecvError|
// Tenant existed but was dropped: report it as non-existent
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
)?;
}
TenantState::Active { .. } => {
return Ok(());
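Net effect of this hunk: the timeout and cancellation handling move out of wait_to_become_active, which now simply waits on the state watch channel; the caller wraps the call in timeout_cancellable instead, as the get_active_tenant_with_timeout hunk later in this compare shows.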
@@ -2596,7 +2463,7 @@ impl Tenant {
conf,
// using now here is a good enough approximation to catch tenants with really long
// activation times.
constructed_at: Instant::now(),
loading_started_at: Instant::now(),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),
@@ -2608,7 +2475,6 @@ impl Tenant {
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
activate_now_sem: tokio::sync::Semaphore::new(0),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
cancel: CancellationToken::default(),
gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
@@ -3193,7 +3059,6 @@ impl Tenant {
storage,
&self.tenant_shard_id,
&existing_initdb_timeline_id,
&self.cancel,
)
.await
.context("download initdb tar")?;
@@ -3234,7 +3099,6 @@ impl Tenant {
&timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
},
@@ -3242,7 +3106,9 @@ impl Tenant {
3,
u32::MAX,
"persist_initdb_tar_zst",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
backoff::Cancel::new(self.cancel.clone(), || {
anyhow::anyhow!("initdb upload cancelled")
}),
)
.await?;

View File

@@ -71,7 +71,6 @@ async fn create_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
@@ -88,7 +87,8 @@ async fn create_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("mark_upload")?;
@@ -170,7 +170,6 @@ async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
@@ -180,7 +179,8 @@ async fn remove_tenant_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("remove_tenant_remote_delete_mark")?;
@@ -322,15 +322,9 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -530,14 +524,8 @@ impl DeleteTenantFlow {
.context("timelines dir not empty")?;
}
remove_tenant_remote_delete_mark(
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await?;
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
.await?;
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(

View File

@@ -28,7 +28,7 @@ use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::metrics::TENANT_MANAGER as METRICS;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
@@ -44,6 +44,7 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
@@ -429,13 +430,6 @@ pub async fn init_tenant_mgr(
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
tracing::info!(
"Attaching {} tenants at startup, warming up {} at a time",
tenant_configs.len(),
conf.concurrent_tenant_warmup.initial_permits()
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -854,6 +848,17 @@ impl TenantManager {
}
}
pub(crate) async fn delete_timeline(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = self.get_attached_tenant_shard(tenant_shard_id, true)?;
DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
Ok(())
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,
@@ -1216,10 +1221,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
// Fast path: we don't need to do any async waiting.
return Ok(tenant.clone());
}
_ => {
tenant.activate_now();
(WaitFor::Tenant(tenant.clone()), tenant_shard_id)
}
_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
}
}
Some(TenantSlot::Secondary) => {
@@ -1273,10 +1275,28 @@ pub(crate) async fn get_active_tenant_with_timeout(
};
tracing::debug!("Waiting for tenant to enter active state...");
tenant
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await?;
Ok(tenant)
match timeout_cancellable(
deadline.duration_since(Instant::now()),
cancel,
tenant.wait_to_become_active(),
)
.await
{
Ok(Ok(())) => Ok(tenant),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
Ok(tenant)
} else {
Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: Some(latest_state),
wait_time: timeout,
})
}
}
Err(TimeoutCancellableError::Cancelled) => Err(GetActiveTenantError::Cancelled),
}
}
pub(crate) async fn delete_tenant(

View File

@@ -196,12 +196,10 @@ pub(crate) use upload::upload_initdb_dir;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use std::ops::DerefMut;
@@ -318,47 +316,6 @@ pub struct RemoteTimelineClient {
storage_impl: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
cancel: CancellationToken,
}
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
///
/// This is a convenience for the various upload functions. In future
/// the anyhow::Error result should be replaced with a more structured type that
/// enables callers to avoid handling shutdown as an error.
async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
where
F: std::future::Future<Output = anyhow::Result<()>>,
{
match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
Ok(Ok(())) => Ok(()),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
}
}
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloadError.
async fn download_cancellable<F, R>(
cancel: &CancellationToken,
future: F,
) -> Result<R, DownloadError>
where
F: std::future::Future<Output = Result<R, DownloadError>>,
{
match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
Ok(Ok(r)) => Ok(r),
Ok(Err(e)) => Err(e),
Err(TimeoutCancellableError::Timeout) => {
Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
}
Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
}
}
impl RemoteTimelineClient {
@@ -394,7 +351,6 @@ impl RemoteTimelineClient {
&tenant_shard_id,
&timeline_id,
)),
cancel: CancellationToken::new(),
}
}
@@ -545,7 +501,6 @@ impl RemoteTimelineClient {
&self,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
cancel: &CancellationToken,
) -> anyhow::Result<u64> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -562,7 +517,6 @@ impl RemoteTimelineClient {
self.timeline_id,
layer_file_name,
layer_metadata,
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
@@ -1017,7 +971,6 @@ impl RemoteTimelineClient {
&self.timeline_id,
self.generation,
&index_part_with_deleted_at,
&self.cancel,
)
},
|_e| false,
@@ -1027,7 +980,8 @@ impl RemoteTimelineClient {
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
@@ -1327,7 +1281,6 @@ impl RemoteTimelineClient {
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
@@ -1354,7 +1307,6 @@ impl RemoteTimelineClient {
&self.timeline_id,
self.generation,
index_part,
&self.cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
@@ -1876,7 +1828,6 @@ mod tests {
&self.harness.tenant_shard_id,
&TIMELINE_ID,
)),
cancel: CancellationToken::new(),
})
}

View File

@@ -5,6 +5,7 @@
use std::collections::HashSet;
use std::future::Future;
use std::time::Duration;
use anyhow::{anyhow, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -13,17 +14,13 @@ use tokio::fs::{self, File, OpenOptions};
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::warn;
use utils::timeout::timeout_cancellable;
use utils::{backoff, crashsafe};
use crate::config::PageServerConf;
use crate::tenant::remote_timeline_client::{
download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
};
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use crate::virtual_file::on_fatal_io_error;
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
use utils::crashsafe::path_with_suffix_extension;
@@ -35,6 +32,8 @@ use super::{
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
};
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
///
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -47,7 +46,6 @@ pub async fn download_layer_file<'a>(
timeline_id: TimelineId,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
cancel: &CancellationToken,
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -75,18 +73,14 @@ pub async fn download_layer_file<'a>(
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let cancel_inner = cancel.clone();
let (mut destination_file, bytes_amount) = download_retry(
|| async {
let destination_file = tokio::fs::File::create(&temp_file_path)
.await
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
.map_err(DownloadError::Other)?;
// Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
// file: the write to local file doesn't start until after the request header is returned
// and we start draining the body stream below
let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
let download = storage
.download(&remote_path)
.await
.with_context(|| {
format!(
@@ -100,33 +94,12 @@ pub async fn download_layer_file<'a>(
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
// Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
// and we will unlink the temporary file if there is an error. This unlink is important because we
// are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
// we will imminently try to write to again.
let bytes_amount: u64 = match timeout_cancellable(
DOWNLOAD_TIMEOUT,
&cancel_inner,
let bytes_amount = tokio::time::timeout(
MAX_DOWNLOAD_DURATION,
tokio::io::copy_buf(&mut reader, &mut destination_file),
)
.await
.with_context(|| {
format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
)
})
.map_err(DownloadError::Other)?
{
Ok(b) => Ok(b),
Err(e) => {
// Remove incomplete files: on restart Timeline would do this anyway, but we must
// do it here for the retry case.
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
}
Err(e)
}
}
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
.with_context(|| {
format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
@@ -139,7 +112,6 @@ pub async fn download_layer_file<'a>(
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
cancel,
)
.await?;
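The copy into the temporary file above is now bounded by a plain tokio timeout (MAX_DOWNLOAD_DURATION) rather than the removed timeout_cancellable wrapper. A hedged, standalone sketch of that shape (function name and the anyhow error mapping are illustrative):

use std::time::Duration;

async fn copy_with_deadline<R, W>(reader: &mut R, writer: &mut W) -> anyhow::Result<u64>
where
    R: tokio::io::AsyncBufRead + Unpin,
    W: tokio::io::AsyncWrite + Unpin,
{
    // Outer Err means the deadline elapsed; inner Err is an I/O error from the copy.
    tokio::time::timeout(Duration::from_secs(120), tokio::io::copy_buf(reader, writer))
        .await
        .map_err(|e| anyhow::anyhow!("Timed out {e:?}"))?
        .map_err(anyhow::Error::from)
}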
@@ -216,14 +188,8 @@ pub async fn list_remote_timelines(
anyhow::bail!("storage-sync-list-remote-timelines");
});
let cancel_inner = cancel.clone();
let listing = download_retry_forever(
|| {
download_cancellable(
&cancel_inner,
storage.list(Some(&remote_path), ListingMode::WithDelimiter),
)
},
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
&format!("list timelines for {tenant_shard_id}"),
cancel,
)
@@ -264,13 +230,9 @@ async fn do_download_index_part(
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let cancel_inner = cancel.clone();
let index_part_bytes = download_retry_forever(
|| async {
// Cancellation: it is safe to cancel this future because we're just downloading into
// a memory buffer, not touching local disk.
let index_part_download =
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
let index_part_download = storage.download(&remote_path).await?;
let mut index_part_bytes = Vec::new();
let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -385,7 +347,10 @@ pub(super) async fn download_index_part(
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"listing index_part files",
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error {
unreachable!()
}),
)
.await
.map_err(DownloadError::Other)?;
@@ -424,7 +389,6 @@ pub(crate) async fn download_initdb_tar_zst(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
cancel: &CancellationToken,
) -> Result<(Utf8PathBuf, File), DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -442,8 +406,6 @@ pub(crate) async fn download_initdb_tar_zst(
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let cancel_inner = cancel.clone();
let file = download_retry(
|| async {
let file = OpenOptions::new()
@@ -456,14 +418,10 @@ pub(crate) async fn download_initdb_tar_zst(
.with_context(|| format!("tempfile creation {temp_path}"))
.map_err(DownloadError::Other)?;
let download =
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
let download = storage.download(&remote_path).await?;
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
// TODO: this consumption of the response body should be subject to timeout + cancellation, but
// not without thinking carefully about how to recover safely from cancelling a write to
// local storage (e.g. by writing into a temp file as we do in download_layer)
tokio::io::copy_buf(&mut download, &mut writer)
.await
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
@@ -479,7 +437,6 @@ pub(crate) async fn download_initdb_tar_zst(
Ok(file)
},
&format!("download {remote_path}"),
cancel,
)
.await
.map_err(|e| {
@@ -503,11 +460,7 @@ pub(crate) async fn download_initdb_tar_zst(
/// with backoff.
///
/// (See similar logic for uploads in `perform_upload_task`)
async fn download_retry<T, O, F>(
op: O,
description: &str,
cancel: &CancellationToken,
) -> Result<T, DownloadError>
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
@@ -518,7 +471,10 @@ where
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
unreachable!()
}),
)
.await
}

View File

@@ -7,14 +7,12 @@ use pageserver_api::shard::TenantShardId;
use std::io::{ErrorKind, SeekFrom};
use tokio::fs::{self, File};
use tokio::io::AsyncSeekExt;
use tokio_util::sync::CancellationToken;
use super::Generation;
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
upload_cancellable,
},
};
use remote_storage::GenericRemoteStorage;
@@ -31,7 +29,6 @@ pub(super) async fn upload_index_part<'a>(
timeline_id: &TimelineId,
generation: Generation,
index_part: &'a IndexPart,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
tracing::trace!("uploading new index part");
@@ -47,16 +44,14 @@ pub(super) async fn upload_index_part<'a>(
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
upload_cancellable(
cancel,
storage.upload_storage_object(
storage
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
index_part_size,
&remote_path,
),
)
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
)
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
/// Attempts to upload given layer files.
@@ -69,7 +64,6 @@ pub(super) async fn upload_timeline_layer<'a>(
source_path: &'a Utf8Path,
known_metadata: &'a LayerFileMetadata,
generation: Generation,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
bail!("failpoint before-upload-layer")
@@ -113,7 +107,8 @@ pub(super) async fn upload_timeline_layer<'a>(
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
storage
.upload(reader, fs_size, &storage_path, None)
.await
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
@@ -127,7 +122,6 @@ pub(crate) async fn upload_initdb_dir(
timeline_id: &TimelineId,
mut initdb_tar_zst: File,
size: u64,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
tracing::trace!("uploading initdb dir");
@@ -137,10 +131,8 @@ pub(crate) async fn upload_initdb_dir(
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
upload_cancellable(
cancel,
storage.upload_storage_object(file, size as usize, &remote_path),
)
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
storage
.upload_storage_object(file, size as usize, &remote_path)
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
}

View File

@@ -259,9 +259,8 @@ impl Layer {
layer
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
.instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
.await
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
/// Download the layer if evicted.
@@ -655,6 +654,7 @@ impl LayerInner {
}
/// Cancellation safe.
#[tracing::instrument(skip_all, fields(layer=%self))]
async fn get_or_maybe_download(
self: &Arc<Self>,
allow_download: bool,
@@ -663,101 +663,95 @@ impl LayerInner {
let mut init_permit = None;
loop {
let download = move |permit| {
async move {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
let download = move |permit| async move {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
let timeline = self
.timeline
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;
let timeline = self
.timeline
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;
// FIXME: grab a gate
// FIXME: grab a gate
let can_ever_evict = timeline.remote_client.as_ref().is_some();
let can_ever_evict = timeline.remote_client.as_ref().is_some();
// check if we really need to be downloaded; could have been already downloaded by a
// cancelled previous attempt.
// check if we really need to be downloaded; could have been already downloaded by a
// cancelled previous attempt.
let needs_download = self
.needs_download()
.await
.map_err(DownloadError::PreStatFailed)?;
let permit = if let Some(reason) = needs_download {
if let NeedsDownload::NotFile(ft) = reason {
return Err(DownloadError::NotFile(ft));
}
// only reset this after we've decided we really need to download. otherwise it'd
// be impossible to mark cancelled downloads for eviction, like one could imagine
// we would like to do for prefetching which was not needed.
self.wanted_evicted.store(false, Ordering::Release);
if !can_ever_evict {
return Err(DownloadError::NoRemoteStorage);
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
if !allow_download {
// this does look weird, but for LayerInner the "downloading" means also changing
// internal once related state ...
return Err(DownloadError::DownloadRequired);
}
tracing::info!(%reason, "downloading on-demand");
self.spawn_download_and_wait(timeline, permit).await?
} else {
// the file is present locally, probably by a previous but cancelled call to
// get_or_maybe_download. alternatively we might be running without remote storage.
LAYER_IMPL_METRICS.inc_init_needed_no_download();
permit
};
let since_last_eviction =
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
if let Some(since_last_eviction) = since_last_eviction {
// FIXME: this will not always be recorded correctly until #6028 (the no
// download needed branch above)
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
}
.instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
version: next_version,
});
self.access_stats.record_residence_event(
LayerResidenceStatus::Resident,
LayerResidenceEventReason::ResidenceChange,
);
let waiters = self.inner.initializer_count();
if waiters > 0 {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}
scopeguard::ScopeGuard::into_inner(init_cancelled);
Ok((ResidentOrWantedEvicted::Resident(res), permit))
};
if let Some(init_permit) = init_permit.take() {
@@ -868,7 +862,6 @@ impl LayerInner {
let result = client.download_layer_file(
&this.desc.filename(),
&this.metadata(),
&crate::task_mgr::shutdown_token()
)
.await;

View File

@@ -1734,7 +1734,6 @@ impl Timeline {
self.current_logical_size.current_size().accuracy(),
logical_size::Accuracy::Exact,
);
self.current_logical_size.initialized.add_permits(1);
return;
};
@@ -1780,11 +1779,6 @@ impl Timeline {
cancel: CancellationToken,
background_ctx: RequestContext,
) {
scopeguard::defer! {
// Irrespective of the outcome of this operation, we should unblock anyone waiting for it.
self.current_logical_size.initialized.add_permits(1);
}
enum BackgroundCalculationError {
Cancelled,
Other(anyhow::Error),
@@ -3110,32 +3104,6 @@ impl Timeline {
Ok(image_layers)
}
/// Wait until the background initial logical size calculation is complete, or
/// this Timeline is shut down. Calling this function will cause the initial
/// logical size calculation to skip waiting for the background jobs barrier.
pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
if let Some(await_bg_cancel) = self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
.get()
{
await_bg_cancel.cancel();
} else {
// We should not wait if we were not able to explicitly instruct
// the logical size cancellation to skip the concurrency limit semaphore.
// TODO: this is an unexpected case. We should restructure so that it
// can't happen.
tracing::info!(
"await_initial_logical_size: can't get semaphore cancel token, skipping"
);
}
tokio::select!(
_ = self.current_logical_size.initialized.acquire() => {},
_ = self.cancel.cancelled() => {}
)
}
}
#[derive(Default)]

View File

@@ -34,9 +34,6 @@ pub(super) struct LogicalSize {
pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
OnceCell<CancellationToken>,
/// Once the initial logical size is initialized, this is notified.
pub(crate) initialized: tokio::sync::Semaphore,
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
pub initial_part_end: Option<Lsn>,
@@ -128,7 +125,6 @@ impl LogicalSize {
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
initialized: tokio::sync::Semaphore::new(0),
}
}
@@ -139,7 +135,6 @@ impl LogicalSize {
initial_part_end: Some(compute_to),
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
initialized: tokio::sync::Semaphore::new(0),
}
}

View File

@@ -1,6 +1,3 @@
#include <sys/resource.h>
#include "postgres.h"
#include "access/timeline.h"
@@ -117,25 +114,3 @@ pq_sendint64_le(StringInfo buf, uint64 i)
memcpy(buf->data + buf->len, &i, sizeof(uint64));
buf->len += sizeof(uint64);
}
/*
* Disables core dump for the current process.
*/
void
disable_core_dump()
{
struct rlimit rlim;
#ifdef WALPROPOSER_LIB /* skip in simulation mode */
return;
#endif
rlim.rlim_cur = 0;
rlim.rlim_max = 0;
if (setrlimit(RLIMIT_CORE, &rlim))
{
int save_errno = errno;
fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno));
}
}

View File

@@ -8,6 +8,5 @@ uint32 pq_getmsgint32_le(StringInfo msg);
uint64 pq_getmsgint64_le(StringInfo msg);
void pq_sendint32_le(StringInfo buf, uint32 i);
void pq_sendint64_le(StringInfo buf, uint64 i);
extern void disable_core_dump();
#endif /* __NEON_UTILS_H__ */

View File

@@ -35,8 +35,6 @@
*
*-------------------------------------------------------------------------
*/
#include <sys/resource.h>
#include "postgres.h"
#include "libpq/pqformat.h"
#include "neon.h"
@@ -1071,12 +1069,6 @@ DetermineEpochStartLsn(WalProposer *wp)
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
walprop_shared->mineLastElectedTerm)))
{
/*
* Panic to restart PG as we need to retake basebackup.
* However, don't dump core as this is kinda expected
* scenario.
*/
disable_core_dump();
walprop_log(PANIC,
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
@@ -1453,12 +1445,7 @@ RecvAppendResponses(Safekeeper *sk)
if (sk->appendResponse.term > wp->propTerm)
{
/*
* Another compute with higher term is running. Panic to restart
* PG as we likely need to retake basebackup. However, don't dump
* core as this is kinda expected scenario.
*/
disable_core_dump();
/* Another compute with higher term is running. */
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
sk->host, sk->port,
sk->appendResponse.term, wp->propTerm);

View File

@@ -11,8 +11,7 @@ use crate::auth::validate_password_and_exchange;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::AuthInfo;
use crate::console::AuthSecret;
use crate::proxy::connect_compute::handle_try_wake;
use crate::proxy::retry::retry_after;
use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
use crate::scram;
use crate::stream::Stream;
use crate::{
@@ -23,7 +22,6 @@ use crate::{
provider::{CachedNodeInfo, ConsoleReqExtra},
Api,
},
metrics::LatencyTimer,
stream, url,
};
use futures::TryFutureExt;

View File

@@ -4,7 +4,7 @@ use crate::{
compute,
config::AuthenticationConfig,
console::AuthSecret,
metrics::LatencyTimer,
proxy::LatencyTimer,
sasl,
stream::{PqStream, Stream},
};

View File

@@ -4,7 +4,7 @@ use super::{
use crate::{
auth::{self, AuthFlow},
console::AuthSecret,
metrics::LatencyTimer,
proxy::LatencyTimer,
sasl,
stream::{self, Stream},
};

View File

@@ -1,8 +1,9 @@
//! User credentials used in authentication.
use crate::{
auth::password_hack::parse_endpoint_param, error::UserFacingError,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
auth::password_hack::parse_endpoint_param,
error::UserFacingError,
proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;

View File

@@ -1,6 +1,9 @@
use crate::{
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::UserFacingError,
proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;

View File

@@ -21,7 +21,7 @@ pub mod errors {
use crate::{
error::{io_error, UserFacingError},
http,
proxy::retry::ShouldRetry,
proxy::ShouldRetry,
};
use thiserror::Error;

View File

@@ -5,7 +5,7 @@ use super::{
errors::{ApiError, GetAuthInfoError, WakeComputeError},
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;

View File

@@ -4,16 +4,14 @@
pub mod health_server;
use std::{sync::Arc, time::Duration};
use std::time::Duration;
use futures::FutureExt;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant;
use tracing::trace;
use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
use reqwest_middleware::RequestBuilder;
/// This is the preferred way to create new http clients,
@@ -21,7 +19,7 @@ use reqwest_middleware::RequestBuilder;
/// We deliberately don't want to replace this with a public static.
pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
let client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.http2_prior_knowledge()
.connection_verbose(true)
.build()
.expect("Failed to create http client");
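The hunk above switches the console HTTP client to HTTP/2 with prior knowledge (while the custom GaiResolver line shown next to it is dropped), so requests to the console skip HTTP/1.1 upgrade negotiation and can be multiplexed over a single connection. A minimal sketch of that builder call, assuming reqwest 0.11 with its default features; the helper name is illustrative, not the project's:

fn console_http2_client() -> reqwest::Result<reqwest::Client> {
    reqwest::ClientBuilder::new()
        // "prior knowledge" means no ALPN/upgrade dance: the client assumes the
        // server speaks HTTP/2 and sends HTTP/2 frames from the first byte.
        .http2_prior_knowledge()
        // connection-level event logging, matching the setting used above
        .connection_verbose(true)
        .build()
}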
@@ -34,7 +32,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
let timeout_client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.timeout(default_timout)
.build()
@@ -100,37 +97,6 @@ impl Endpoint {
}
}
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
use hyper::{
client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
service::Service,
};
use reqwest::dns::{Addrs, Resolve, Resolving};
#[derive(Debug)]
pub struct GaiResolver(HyperGaiResolver);
impl Default for GaiResolver {
fn default() -> Self {
Self(HyperGaiResolver::new())
}
}
impl Resolve for GaiResolver {
fn resolve(&self, name: Name) -> Resolving {
let this = &mut self.0.clone();
let start = Instant::now();
Box::pin(
Service::<Name>::call(this, name.clone()).map(move |result| {
let resolve_duration = start.elapsed();
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
result
.map(|addrs| -> Addrs { Box::new(addrs) })
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
}),
)
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -16,7 +16,6 @@ pub mod console;
pub mod error;
pub mod http;
pub mod logging;
pub mod metrics;
pub mod parse;
pub mod protocol2;
pub mod proxy;

View File

@@ -1,232 +0,0 @@
use ::metrics::{
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
IntCounterPairVec, IntCounterVec,
};
use prometheus::{
register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
IntGaugeVec,
};
use once_cell::sync::Lazy;
use tokio::time;
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_db_connections_total",
"Number of opened connections to a database.",
"proxy_closed_db_connections_total",
"Number of closed connections to a database.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_client_connections_total",
"Number of opened connections from a client.",
"proxy_closed_client_connections_total",
"Number of closed connections from a client.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_accepted_connections_total",
"Number of client connections accepted.",
"proxy_closed_connections_total",
"Number of client connections closed.",
&["protocol"],
)
.unwrap()
});
pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// http/ws/tcp, true/false, true/false, success/failure
// 3 * 2 * 2 * 2 = 24 counters
&["protocol", "cache_miss", "pool_miss", "outcome"],
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
.unwrap()
});
pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_console_request_latency",
"Time it took for proxy to make a request to the console",
// proxy_wake_compute/proxy_get_role_info
&["request"],
// largest bucket = 2^16 * 0.2ms = 13s
exponential_buckets(0.0002, 2.0, 16).unwrap(),
)
.unwrap()
});
pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_allowed_ips_cache_misses",
"Number of cache hits/misses for allowed ips",
// hit/miss
&["outcome"],
)
.unwrap()
});
pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_control_plane_token_acquire_seconds",
"Time it took for proxy to acquire a token to connect to the control plane",
// largest bucket = 3^16 * 0.05ms = 2.15s
exponential_buckets(0.00005, 3.0, 16).unwrap(),
)
.unwrap()
});
pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"semaphore_control_plane_limit",
"Current limit of the semaphore control plane",
&["limit"], // 2 counters
)
.unwrap()
});
pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_accepted_connections_by_sni",
"Number of connections (per sni).",
&["kind"],
)
.unwrap()
});
pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_allowed_ips_number",
"Number of allowed ips",
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
)
.unwrap()
});
pub struct LatencyTimer {
// time since the stopwatch was started
start: Option<time::Instant>,
// accumulated time on the stopwatch
accumulated: std::time::Duration,
// label data
protocol: &'static str,
cache_miss: bool,
pool_miss: bool,
outcome: &'static str,
}
pub struct LatencyTimerPause<'a> {
timer: &'a mut LatencyTimer,
}
impl LatencyTimer {
pub fn new(protocol: &'static str) -> Self {
Self {
start: Some(time::Instant::now()),
accumulated: std::time::Duration::ZERO,
protocol,
cache_miss: false,
// by default we don't do pooling
pool_miss: true,
// assume failed unless otherwise specified
outcome: "failed",
}
}
pub fn pause(&mut self) -> LatencyTimerPause<'_> {
// stop the stopwatch and record the time that we have accumulated
let start = self.start.take().expect("latency timer should be started");
self.accumulated += start.elapsed();
LatencyTimerPause { timer: self }
}
pub fn cache_miss(&mut self) {
self.cache_miss = true;
}
pub fn pool_hit(&mut self) {
self.pool_miss = false;
}
pub fn success(mut self) {
self.outcome = "success";
}
}
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
// start the stopwatch again
self.timer.start = Some(time::Instant::now());
}
}
impl Drop for LatencyTimer {
fn drop(&mut self) {
let duration =
self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
self.protocol,
bool_to_str(self.cache_miss),
bool_to_str(self.pool_miss),
self.outcome,
])
.observe(duration.as_secs_f64())
}
}
pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_total",
"Number of connection failures (per kind).",
&["kind"],
)
.unwrap()
});
pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_breakdown",
"Number of wake-up failures (per kind).",
&["retry", "kind"],
)
.unwrap()
});
pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes_per_client",
"Number of bytes sent/received between client and backend.",
crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
)
.unwrap()
});
pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes",
"Number of bytes sent/received between all clients and backends.",
&["direction"],
)
.unwrap()
});
pub const fn bool_to_str(x: bool) -> &'static str {
if x {
"true"
} else {
"false"
}
}

View File

@@ -1,41 +1,265 @@
#[cfg(test)]
mod tests;
pub mod connect_compute;
pub mod retry;
use crate::{
auth,
cancellation::{self, CancelMap},
compute,
compute::{self, PostgresConnection},
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
console::{self, messages::MetricsAuxInfo},
metrics::{
LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
http::StatusCode,
protocol2::WithClientIp,
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
usage_metrics::{Ids, USAGE_METRICS},
};
use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use metrics::{
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
IntCounterPairVec, IntCounterVec,
};
use once_cell::sync::{Lazy, OnceCell};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use prometheus::{
register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
IntGaugeVec,
};
use regex::Regex;
use std::{net::IpAddr, sync::Arc};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
use tokio::{
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
time,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, Instrument};
use tracing::{error, info, info_span, warn, Instrument};
use utils::measured_stream::MeasuredStream;
use self::connect_compute::{connect_to_compute, TcpMechanism};
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
pub const NUM_RETRIES_CONNECT: u32 = 16;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_db_connections_total",
"Number of opened connections to a database.",
"proxy_closed_db_connections_total",
"Number of closed connections to a database.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_opened_client_connections_total",
"Number of opened connections from a client.",
"proxy_closed_client_connections_total",
"Number of closed connections from a client.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"proxy_accepted_connections_total",
"Number of client connections accepted.",
"proxy_closed_connections_total",
"Number of client connections closed.",
&["protocol"],
)
.unwrap()
});
static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// http/ws/tcp, true/false, true/false, success/failure
// 3 * 2 * 2 * 2 = 24 counters
&["protocol", "cache_miss", "pool_miss", "outcome"],
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
.unwrap()
});
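For a sense of the layout above: prometheus' exponential_buckets(start, factor, count) produces count finite bounds start, start*factor, ..., start*factor^(count-1), so these 16 buckets run from 0.5 ms up to about 16.4 s, with anything slower landing in the implicit +Inf bucket (the "32 s" comment appears to count one extra doubling past the last finite bound). A small check, assuming the workspace metrics helper re-exports the standard prometheus function:

fn compute_connection_latency_buckets() -> Vec<f64> {
    // 0.0005, 0.001, 0.002, ..., 0.0005 * 2^15 = 16.384 (seconds)
    prometheus::exponential_buckets(0.0005, 2.0, 16)
        .expect("start > 0, factor > 1, count > 0")
}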
pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_console_request_latency",
"Time it took for proxy to make a request to the console",
// proxy_wake_compute/proxy_get_role_info
&["request"],
// largest bucket = 2^16 * 0.2ms = 13s
exponential_buckets(0.0002, 2.0, 16).unwrap(),
)
.unwrap()
});
pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_allowed_ips_cache_misses",
"Number of cache hits/misses for allowed ips",
// hit/miss
&["outcome"],
)
.unwrap()
});
pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_control_plane_token_acquire_seconds",
"Time it took for proxy to acquire a token to connect to the control plane",
// largest bucket = 3^16 * 0.05ms = 2.15s
exponential_buckets(0.00005, 3.0, 16).unwrap(),
)
.unwrap()
});
pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"semaphore_control_plane_limit",
"Current limit of the semaphore control plane",
&["limit"], // 2 counters
)
.unwrap()
});
pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_accepted_connections_by_sni",
"Number of connections (per sni).",
&["kind"],
)
.unwrap()
});
pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_allowed_ips_number",
"Number of allowed ips",
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
)
.unwrap()
});
pub struct LatencyTimer {
// time since the stopwatch was started
start: Option<Instant>,
// accumulated time on the stopwatch
accumulated: std::time::Duration,
// label data
protocol: &'static str,
cache_miss: bool,
pool_miss: bool,
outcome: &'static str,
}
pub struct LatencyTimerPause<'a> {
timer: &'a mut LatencyTimer,
}
impl LatencyTimer {
pub fn new(protocol: &'static str) -> Self {
Self {
start: Some(Instant::now()),
accumulated: std::time::Duration::ZERO,
protocol,
cache_miss: false,
// by default we don't do pooling
pool_miss: true,
// assume failed unless otherwise specified
outcome: "failed",
}
}
pub fn pause(&mut self) -> LatencyTimerPause<'_> {
// stop the stopwatch and record the time that we have accumulated
let start = self.start.take().expect("latency timer should be started");
self.accumulated += start.elapsed();
LatencyTimerPause { timer: self }
}
pub fn cache_miss(&mut self) {
self.cache_miss = true;
}
pub fn pool_hit(&mut self) {
self.pool_miss = false;
}
pub fn success(mut self) {
self.outcome = "success";
}
}
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
// start the stopwatch again
self.timer.start = Some(Instant::now());
}
}
impl Drop for LatencyTimer {
fn drop(&mut self) {
let duration =
self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
COMPUTE_CONNECTION_LATENCY
.with_label_values(&[
self.protocol,
bool_to_str(self.cache_miss),
bool_to_str(self.pool_miss),
self.outcome,
])
.observe(duration.as_secs_f64())
}
}
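LatencyTimer above is a pausable stopwatch: pause() hands out a guard that stops the clock, dropping the guard restarts it, and dropping the timer itself observes the accumulated plus still-running time into COMPUTE_CONNECTION_LATENCY under the protocol/cache_miss/pool_miss/outcome labels. A hypothetical call site (not taken from the proxy) sketching that flow, assuming the types above are in scope:

fn sketch_latency_accounting() {
    let mut timer = LatencyTimer::new("tcp");
    {
        // time spent waiting on the client (e.g. authentication) should not be
        // charged to compute connection latency, so stop the clock meanwhile
        let _paused = timer.pause();
        // ... wait for the client ...
    } // the guard drops here and the stopwatch resumes
    timer.cache_miss(); // sets the cache_miss label to "true"
    // success() consumes the timer and flips the outcome label; the Drop impl
    // then records accumulated + running time into the histogram.
    timer.success();
}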
static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_total",
"Number of connection failures (per kind).",
&["kind"],
)
.unwrap()
});
static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_connection_failures_breakdown",
"Number of wake-up failures (per kind).",
&["retry", "kind"],
)
.unwrap()
});
static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes_per_client",
"Number of bytes sent/received between client and backend.",
crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
)
.unwrap()
});
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes",
"Number of bytes sent/received between all clients and backends.",
&["direction"],
)
.unwrap()
});
pub async fn run_until_cancelled<F: std::future::Future>(
f: F,
cancellation_token: &CancellationToken,
@@ -315,6 +539,296 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
}
/// If we couldn't connect, a cached connection info might be to blame
/// (e.g. the compute node's address might've changed at the wrong time).
/// Invalidate the cache entry (if any) to prevent subsequent errors.
#[tracing::instrument(name = "invalidate_cache", skip_all)]
pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
node_info.invalidate().config
}
/// Try to connect to the compute node once.
#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
proto: &'static str,
) -> Result<PostgresConnection, compute::ConnectionError> {
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute, timeout, proto)
.await
}
#[async_trait]
pub trait ConnectMechanism {
type Connection;
type ConnectError;
type Error: From<Self::ConnectError>;
async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;
fn update_connect_config(&self, conf: &mut compute::ConnCfg);
}
pub struct TcpMechanism<'a> {
/// KV-dictionary with PostgreSQL connection params.
pub params: &'a StartupMessageParams,
pub proto: &'static str,
}
#[async_trait]
impl ConnectMechanism for TcpMechanism<'_> {
type Connection = PostgresConnection;
type ConnectError = compute::ConnectionError;
type Error = compute::ConnectionError;
async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
connect_to_compute_once(node_info, timeout, self.proto).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
config.set_startup_params(self.params);
}
}
const fn bool_to_str(x: bool) -> &'static str {
if x {
"true"
} else {
"false"
}
}
fn report_error(e: &WakeComputeError, retry: bool) {
use crate::console::errors::ApiError;
let retry = bool_to_str(retry);
let kind = match e {
WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
"quota_exceeded"
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => "api_console_locked",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => "api_console_bad_request",
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
"api_console_other_server_error"
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}
/// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism>(
mechanism: &M,
mut node_info: console::CachedNodeInfo,
extra: &console::ConsoleReqExtra,
creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
mut latency_timer: LatencyTimer,
) -> Result<M::Connection, M::Error>
where
M::ConnectError: ShouldRetry + std::fmt::Debug,
M::Error: From<WakeComputeError>,
{
mechanism.update_connect_config(&mut node_info.config);
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => {
latency_timer.success();
return Ok(res);
}
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
};
latency_timer.cache_miss();
let mut num_retries = 1;
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
#[cfg(feature = "testing")]
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
#[cfg(test)]
auth::BackendType::Test(x) => x.wake_compute(),
};
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
report_error(&e, false);
return Err(e.into());
}
// failed to wake up but we can continue to retry
Ok(ControlFlow::Continue(e)) => {
report_error(&e, true);
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
// successfully woke up a compute node and can break the wakeup loop
Ok(ControlFlow::Break(mut node_info)) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
};
// now that we have a new node, try to connect to it repeatedly.
// this can error for a few reasons, for instance:
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => {
latency_timer.success();
return Ok(res);
}
Err(e) => {
let retriable = e.should_retry(num_retries);
if !retriable {
error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
return Err(e.into());
}
warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
}
}
/// Attempts to wake up the compute node.
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
}
}
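The doc comment above describes the contract that drives the wake-up loop in connect_to_compute earlier in this file: Continue carries a transient error worth retrying, Break carries a usable node, and Err is terminal. A stand-alone illustration (the helper below is hypothetical, not part of the proxy):

fn wake_outcome_label(
    result: Result<console::CachedNodeInfo, WakeComputeError>,
    num_retries: u32,
) -> Result<&'static str, WakeComputeError> {
    use std::ops::ControlFlow;
    Ok(match handle_try_wake(result, num_retries)? {
        // transient failure: the caller sleeps retry_after(num_retries) and loops
        ControlFlow::Continue(_transient) => "retry",
        // a compute node is awake; proceed to connect to it
        ControlFlow::Break(_node_info) => "ready",
    })
}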
pub trait ShouldRetry {
fn could_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
}
}
impl ShouldRetry for io::Error {
fn could_retry(&self) -> bool {
use std::io::ErrorKind;
matches!(
self.kind(),
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
)
}
}
impl ShouldRetry for tokio_postgres::error::DbError {
fn could_retry(&self) -> bool {
use tokio_postgres::error::SqlState;
matches!(
self.code(),
&SqlState::CONNECTION_FAILURE
| &SqlState::CONNECTION_EXCEPTION
| &SqlState::CONNECTION_DOES_NOT_EXIST
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
}
}
impl ShouldRetry for tokio_postgres::Error {
fn could_retry(&self) -> bool {
if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
io::Error::could_retry(io_err)
} else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
tokio_postgres::error::DbError::could_retry(db_err)
} else {
false
}
}
}
impl ShouldRetry for compute::ConnectionError {
fn could_retry(&self) -> bool {
match self {
compute::ConnectionError::Postgres(err) => err.could_retry(),
compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
_ => false,
}
}
}
pub fn retry_after(num_retries: u32) -> time::Duration {
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
}
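Plugging in the constants above: with a 25 ms base and a sqrt(2) exponent the wait roughly doubles every two retries, giving 25 ms, ~35 ms, 50 ms, ~71 ms, 100 ms, ..., about 3.2 s by the 15th attempt and roughly 15 s in total over the NUM_RETRIES_CONNECT = 16 attempts. A small sketch reproducing that schedule with the same assumed constants (standalone, not the project's code):

fn retry_schedule_sketch() {
    let base = std::time::Duration::from_millis(25); // BASE_RETRY_WAIT_DURATION
    let exponent = std::f64::consts::SQRT_2; // RETRY_WAIT_EXPONENT_BASE
    let mut total = std::time::Duration::ZERO;
    for num_retries in 1..=16u32 {
        let wait = base.mul_f64(exponent.powi(num_retries as i32 - 1));
        total += wait;
        // num_retries = 1 -> 25ms, 3 -> 50ms, 5 -> 100ms, ..., 15 -> 3.2s
        println!("retry {num_retries}: wait {wait:?} (cumulative {total:?})");
    }
}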
/// Finish client connection initialization: confirm auth success, send params, etc.
#[tracing::instrument(skip_all)]
async fn prepare_client_connection(

View File

@@ -1,238 +0,0 @@
use crate::{
auth,
compute::{self, PostgresConnection},
console::{self, errors::WakeComputeError, Api},
metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
proxy::retry::{retry_after, ShouldRetry},
};
use async_trait::async_trait;
use hyper::StatusCode;
use pq_proto::StartupMessageParams;
use std::ops::ControlFlow;
use tokio::time;
use tracing::{error, info, warn};
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
/// If we couldn't connect, a cached connection info might be to blame
/// (e.g. the compute node's address might've changed at the wrong time).
/// Invalidate the cache entry (if any) to prevent subsequent errors.
#[tracing::instrument(name = "invalidate_cache", skip_all)]
pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
node_info.invalidate().config
}
/// Try to connect to the compute node once.
#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
proto: &'static str,
) -> Result<PostgresConnection, compute::ConnectionError> {
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute, timeout, proto)
.await
}
#[async_trait]
pub trait ConnectMechanism {
type Connection;
type ConnectError;
type Error: From<Self::ConnectError>;
async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;
fn update_connect_config(&self, conf: &mut compute::ConnCfg);
}
pub struct TcpMechanism<'a> {
/// KV-dictionary with PostgreSQL connection params.
pub params: &'a StartupMessageParams,
pub proto: &'static str,
}
#[async_trait]
impl ConnectMechanism for TcpMechanism<'_> {
type Connection = PostgresConnection;
type ConnectError = compute::ConnectionError;
type Error = compute::ConnectionError;
async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
connect_to_compute_once(node_info, timeout, self.proto).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
config.set_startup_params(self.params);
}
}
fn report_error(e: &WakeComputeError, retry: bool) {
use crate::console::errors::ApiError;
let retry = bool_to_str(retry);
let kind = match e {
WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
"quota_exceeded"
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => "api_console_locked",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => "api_console_bad_request",
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
"api_console_other_server_error"
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}
/// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism>(
mechanism: &M,
mut node_info: console::CachedNodeInfo,
extra: &console::ConsoleReqExtra,
creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
mut latency_timer: LatencyTimer,
) -> Result<M::Connection, M::Error>
where
M::ConnectError: ShouldRetry + std::fmt::Debug,
M::Error: From<WakeComputeError>,
{
mechanism.update_connect_config(&mut node_info.config);
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => {
latency_timer.success();
return Ok(res);
}
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
};
latency_timer.cache_miss();
let mut num_retries = 1;
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
#[cfg(feature = "testing")]
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
#[cfg(test)]
auth::BackendType::Test(x) => x.wake_compute(),
};
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
report_error(&e, false);
return Err(e.into());
}
// failed to wake up but we can continue to retry
Ok(ControlFlow::Continue(e)) => {
report_error(&e, true);
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
// successfully woke up a compute node and can break the wakeup loop
Ok(ControlFlow::Break(mut node_info)) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
};
// now that we have a new node, try to connect to it repeatedly.
// this can error for a few reasons, for instance:
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => {
latency_timer.success();
return Ok(res);
}
Err(e) => {
let retriable = e.should_retry(num_retries);
if !retriable {
error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
return Err(e.into());
}
warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
}
}
/// Attempts to wake up the compute node.
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
}
}

View File

@@ -1,68 +0,0 @@
use crate::compute;
use std::{error::Error, io};
use tokio::time;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
pub const NUM_RETRIES_CONNECT: u32 = 16;
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
pub trait ShouldRetry {
fn could_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
}
}
impl ShouldRetry for io::Error {
fn could_retry(&self) -> bool {
use std::io::ErrorKind;
matches!(
self.kind(),
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
)
}
}
impl ShouldRetry for tokio_postgres::error::DbError {
fn could_retry(&self) -> bool {
use tokio_postgres::error::SqlState;
matches!(
self.code(),
&SqlState::CONNECTION_FAILURE
| &SqlState::CONNECTION_EXCEPTION
| &SqlState::CONNECTION_DOES_NOT_EXIST
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
}
}
impl ShouldRetry for tokio_postgres::Error {
fn could_retry(&self) -> bool {
if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
io::Error::could_retry(io_err)
} else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
tokio_postgres::error::DbError::could_retry(db_err)
} else {
false
}
}
}
impl ShouldRetry for compute::ConnectionError {
fn could_retry(&self) -> bool {
match self {
compute::ConnectionError::Postgres(err) => err.could_retry(),
compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
_ => false,
}
}
}
pub fn retry_after(num_retries: u32) -> time::Duration {
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
}

View File

@@ -2,13 +2,10 @@
mod mitm;
use super::connect_compute::ConnectMechanism;
use super::retry::ShouldRetry;
use super::*;
use crate::auth::backend::{ComputeUserInfo, TestBackend};
use crate::config::CertResolver;
use crate::console::{CachedNodeInfo, NodeInfo};
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{auth, http, sasl, scram};
use async_trait::async_trait;
use rstest::rstest;
@@ -426,7 +423,7 @@ impl ConnectMechanism for TestConnectMechanism {
async fn connect_once(
&self,
_node_info: &console::CachedNodeInfo,
_timeout: std::time::Duration,
_timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];

View File

@@ -120,7 +120,7 @@ where
struct PgFrame;
impl Decoder for PgFrame {
type Item = Bytes;
type Error = std::io::Error;
type Error = io::Error;
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.len() < 5 {
@@ -136,7 +136,7 @@ impl Decoder for PgFrame {
}
}
impl Encoder<Bytes> for PgFrame {
type Error = std::io::Error;
type Error = io::Error;
fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> {
dst.extend_from_slice(&item);

View File

@@ -393,10 +393,10 @@ impl Limiter {
}
new_limit
};
crate::metrics::RATE_LIMITER_LIMIT
crate::proxy::RATE_LIMITER_LIMIT
.with_label_values(&["expected"])
.set(new_limit as i64);
crate::metrics::RATE_LIMITER_LIMIT
crate::proxy::RATE_LIMITER_LIMIT
.with_label_values(&["actual"])
.set(actual_limit as i64);
self.limits.store(new_limit, Ordering::Release);
@@ -470,7 +470,7 @@ impl reqwest_middleware::Middleware for Limiter {
)
})?;
info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
match next.run(req, extensions).await {
Ok(response) => {
self.release(token, Some(Outcome::from_reqwest_response(&response)))

View File

@@ -13,8 +13,8 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE;
use crate::rate_limiter::EndpointRateLimiter;
use crate::{cancellation::CancelMap, config::ProxyConfig};
use futures::StreamExt;

View File

@@ -24,12 +24,13 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console,
metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
proxy::{connect_compute::ConnectMechanism, neon_options},
proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
};
use crate::{compute, config};
use crate::proxy::ConnectMechanism;
use tracing::{error, warn, Span};
use tracing::{info, info_span, Instrument};
@@ -443,7 +444,7 @@ async fn connect_to_compute(
.await?
.context("missing cache entry from wake_compute")?;
crate::proxy::connect_compute::connect_to_compute(
crate::proxy::connect_to_compute(
&TokioMechanism {
conn_id,
conn_info,

View File

@@ -29,7 +29,7 @@ use utils::http::error::ApiError;
use utils::http::json::json_response;
use crate::config::HttpConfig;
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE;
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;

View File

@@ -31,7 +31,6 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] }
pageserver = { path = "../pageserver" }
pageserver_api = { path = "../libs/pageserver_api" }
remote_storage = { path = "../libs/remote_storage" }
tracing.workspace = true

View File

@@ -7,12 +7,13 @@ use utils::generation::Generation;
use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
use crate::{download_object_with_retries, RootTarget};
use futures_util::{pin_mut, StreamExt};
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::IndexPart;
use remote_storage::RemotePath;
use utils::id::TenantTimelineId;
pub(crate) struct TimelineAnalysis {
/// Anomalies detected
@@ -38,8 +39,8 @@ impl TimelineAnalysis {
}
}
pub(crate) fn branch_cleanup_and_check_errors(
id: &TenantShardTimelineId,
pub(crate) async fn branch_cleanup_and_check_errors(
id: &TenantTimelineId,
s3_root: &RootTarget,
s3_active_branch: Option<&BranchData>,
console_branch: Option<BranchData>,
@@ -237,7 +238,7 @@ fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), St
pub(crate) async fn list_timeline_blobs(
s3_client: &Client,
id: TenantShardTimelineId,
id: TenantTimelineId,
s3_root: &RootTarget,
) -> anyhow::Result<S3TimelineBlobData> {
let mut s3_layers = HashSet::new();

View File

@@ -10,16 +10,15 @@ use aws_sdk_s3::{
Client,
};
use futures_util::{pin_mut, TryStreamExt};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use tokio_stream::StreamExt;
use utils::id::TenantId;
use utils::id::{TenantId, TenantTimelineId};
use crate::{
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
init_remote,
metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants},
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, TraversingDepth,
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth,
};
#[derive(Serialize, Deserialize, Debug)]
@@ -30,8 +29,8 @@ enum GarbageReason {
#[derive(Serialize, Deserialize, Debug)]
enum GarbageEntity {
Tenant(TenantShardId),
Timeline(TenantShardTimelineId),
Tenant(TenantId),
Timeline(TenantTimelineId),
}
#[derive(Serialize, Deserialize, Debug)]
@@ -143,9 +142,6 @@ async fn find_garbage_inner(
console_projects.len()
);
// TODO(sharding): batch calls into Console so that we only call once for each TenantId,
// rather than checking the same TenantId for multiple TenantShardId
// Enumerate Tenants in S3, and check if each one exists in Console
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
let tenants = stream_tenants(&s3_client, &target);
@@ -153,10 +149,10 @@ async fn find_garbage_inner(
let api_client = cloud_admin_api_client.clone();
let console_projects = &console_projects;
async move {
match console_projects.get(&t.tenant_id) {
match console_projects.get(&t) {
Some(project_data) => Ok((t, Some(project_data.clone()))),
None => api_client
.find_tenant_project(t.tenant_id)
.find_tenant_project(t)
.await
.map_err(|e| anyhow::anyhow!(e))
.map(|r| (t, r)),
@@ -170,21 +166,21 @@ async fn find_garbage_inner(
// checks if they are enabled by the `depth` parameter.
pin_mut!(tenants_checked);
let mut garbage = GarbageList::new(node_kind, bucket_config);
let mut active_tenants: Vec<TenantShardId> = vec![];
let mut active_tenants: Vec<TenantId> = vec![];
let mut counter = 0;
while let Some(result) = tenants_checked.next().await {
let (tenant_shard_id, console_result) = result?;
let (tenant_id, console_result) = result?;
// Paranoia check
if let Some(project) = &console_result {
assert!(project.tenant == tenant_shard_id.tenant_id);
assert!(project.tenant == tenant_id);
}
if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
tracing::debug!("Tenant {tenant_shard_id} is garbage");
if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) {
tracing::debug!("Tenant {tenant_id} is garbage");
} else {
tracing::debug!("Tenant {tenant_shard_id} is active");
active_tenants.push(tenant_shard_id);
tracing::debug!("Tenant {tenant_id} is active");
active_tenants.push(tenant_id);
}
counter += 1;
@@ -270,13 +266,13 @@ impl std::fmt::Display for PurgeMode {
pub async fn get_tenant_objects(
s3_client: &Arc<Client>,
target: RootTarget,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
) -> anyhow::Result<Vec<ObjectIdentifier>> {
tracing::debug!("Listing objects in tenant {tenant_shard_id}");
tracing::debug!("Listing objects in tenant {tenant_id}");
// TODO: apply extra validation based on object modification time. Don't purge
// tenants where any timeline's index_part.json has been touched recently.
let mut tenant_root = target.tenant_root(&tenant_shard_id);
let mut tenant_root = target.tenant_root(&tenant_id);
// Remove delimiter, so that object listing lists all keys in the prefix and not just
// common prefixes.
@@ -289,7 +285,7 @@ pub async fn get_tenant_objects(
pub async fn get_timeline_objects(
s3_client: &Arc<Client>,
target: RootTarget,
ttid: TenantShardTimelineId,
ttid: TenantTimelineId,
) -> anyhow::Result<Vec<ObjectIdentifier>> {
tracing::debug!("Listing objects in timeline {ttid}");
let mut timeline_root = target.timeline_root(&ttid);

View File

@@ -22,7 +22,6 @@ use aws_sdk_s3::{Client, Config};
use clap::ValueEnum;
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use std::io::IsTerminal;
@@ -30,7 +29,7 @@ use tokio::io::AsyncReadExt;
use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use utils::id::TimelineId;
use utils::id::{TenantId, TenantTimelineId};
const MAX_RETRIES: usize = 20;
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -45,35 +44,6 @@ pub struct S3Target {
pub delimiter: String,
}
/// Convenience for referring to timelines within a particular shard: more ergonomic
/// than using a 2-tuple.
///
/// This is the shard-aware equivalent of TenantTimelineId. It's defined here rather
/// than somewhere more broadly exposed, because this kind of thing is rarely needed
/// in the pageserver, as all timeline objects exist in the scope of a particular
/// tenant: the scrubber is different in that it handles collections of data referring to many
/// TenantShardTimelineIds in one place.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub struct TenantShardTimelineId {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
}
impl TenantShardTimelineId {
fn new(tenant_shard_id: TenantShardId, timeline_id: TimelineId) -> Self {
Self {
tenant_shard_id,
timeline_id,
}
}
}
impl Display for TenantShardTimelineId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}/{}", self.tenant_shard_id, self.timeline_id)
}
}
#[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
pub enum TraversingDepth {
Tenant,
@@ -140,19 +110,19 @@ impl RootTarget {
}
}
pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target {
pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target {
self.tenants_root().with_sub_segment(&tenant_id.to_string())
}
pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target {
pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target {
match self {
Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"),
Self::Safekeeper(_) => self.tenant_root(tenant_id),
}
}
pub fn timeline_root(&self, id: &TenantShardTimelineId) -> S3Target {
self.timelines_root(&id.tenant_shard_id)
pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target {
self.timelines_root(&id.tenant_id)
.with_sub_segment(&id.timeline_id.to_string())
}

View File

@@ -3,15 +3,14 @@ use async_stream::{stream, try_stream};
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use tokio_stream::Stream;
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
use pageserver_api::shard::TenantShardId;
use utils::id::TimelineId;
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId};
use utils::id::{TenantTimelineId, TimelineId};
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
s3_client: &'a Client,
target: &'a RootTarget,
) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
try_stream! {
let mut continuation_token = None;
let tenants_target = target.tenants_root();
@@ -45,14 +44,14 @@ pub fn stream_tenants<'a>(
}
}
/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
/// Given a TenantId, output a stream of the timelines within that tenant, discovered
/// using ListObjectsv2. The listing is done before the stream is built, so that this
/// function can be used to generate concurrency on a stream using buffer_unordered.
pub async fn stream_tenant_timelines<'a>(
s3_client: &'a Client,
target: &'a RootTarget,
tenant: TenantShardId,
) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
tenant: TenantId,
) -> anyhow::Result<impl Stream<Item = Result<TenantTimelineId, anyhow::Error>> + 'a> {
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
let mut continuation_token = None;
let timelines_target = target.timelines_root(&tenant);
@@ -99,7 +98,7 @@ pub async fn stream_tenant_timelines<'a>(
Ok(stream! {
for i in timeline_ids {
let id = i?;
yield Ok(TenantShardTimelineId::new(tenant, id));
yield Ok(TenantTimelineId::new(tenant, id));
}
})
}

View File

@@ -5,19 +5,20 @@ use crate::checks::{
TimelineAnalysis,
};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget};
use aws_sdk_s3::Client;
use futures_util::{pin_mut, StreamExt, TryStreamExt};
use histogram::Histogram;
use pageserver::tenant::IndexPart;
use serde::Serialize;
use utils::id::TenantTimelineId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
with_garbage: HashSet<TenantShardTimelineId>,
with_errors: HashSet<TenantTimelineId>,
with_warnings: HashSet<TenantTimelineId>,
with_garbage: HashSet<TenantTimelineId>,
indices_by_version: HashMap<usize, usize>,
layer_count: MinMaxHisto,
@@ -131,7 +132,7 @@ impl MetadataSummary {
}
}
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) {
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
}
@@ -198,8 +199,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
async fn report_on_timeline(
s3_client: &Client,
target: &RootTarget,
ttid: TenantShardTimelineId,
) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
ttid: TenantTimelineId,
) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> {
let data = list_timeline_blobs(s3_client, ttid, target).await?;
Ok((ttid, data))
}
@@ -212,7 +213,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
let (ttid, data) = i?;
summary.update_data(&data);
let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
let analysis =
branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await;
summary.update_analysis(&ttid, &analysis);
}

View File

@@ -79,9 +79,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
# and it is not a failure of our code when it happens.
".*DeleteObjects.*We encountered an internal error. Please try again.*",
# During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
# up is tracked in https://github.com/neondatabase/neon/issues/6096
".*Cancelled, shutting down.*",
)


@@ -52,16 +52,7 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
TenantId(t["id"]) for t in ps_http.tenant_list()
], "tenant should not be attached after negative test"
env.pageserver.allowed_errors.extend(
[
# This fixture detaches the tenant, and tests using it will tend to re-attach it
# shortly after. There may be un-processed deletion_queue validations from the
# initial attachment
".*Dropped remote consistent LSN updates.*",
# This fixture is for tests that will intentionally generate 400 responses
".*Error processing HTTP request: Bad request",
]
)
env.pageserver.allowed_errors.append(".*Error processing HTTP request: Bad request")
def log_contains_bad_request():
env.pageserver.log_contains(".*Error processing HTTP request: Bad request")


@@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
env.pageserver.allowed_errors.extend(
[
".*get_value_reconstruct_data for layer .*",
".*layer loading failed:.*",
".*could not find data for key.*",
".*is not active. Current state: Broken.*",
".*will not become active. Current state: Broken.*",
@@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
# (We don't check layer file contents on startup, when loading the timeline)
#
# This will change when we implement checksums for layers
with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err:
with pytest.raises(Exception, match="layer loading failed:") as err:
pg2.start()
log.info(
f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"


@@ -300,8 +300,7 @@ def test_timeline_initial_logical_size_calculation_cancellation(
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id, timeline_id = env.neon_cli.create_tenant()
# load in some data
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -733,142 +732,3 @@ def wait_for_timeline_size_init(
raise Exception(
f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
)
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
"""
Tenants warming up opportunistically will wait for one another's logical size calculations to complete
before proceeding. However, they skip this if a client is actively trying to access them.
This test is not purely about logical sizes, but logical size calculation is the phase that we
use as a proxy for "warming up" in this test: it happens within the semaphore guard used
to limit concurrent tenant warm-up. (A sketch of this gating scheme follows after the test.)
"""
# We will run with the limit set to 1, so that once we have one tenant stuck
# in a pausable failpoint, the rest are prevented from proceeding through warmup.
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# Create some tenants
n_tenants = 10
tenant_ids = {env.initial_tenant}
for _i in range(0, n_tenants - 1):
tenant_id = TenantId.generate()
env.pageserver.tenant_create(tenant_id)
# Empty tenants are not subject to waiting for logical size calculations, because
# those happen at the timeline level
timeline_id = TimelineId.generate()
env.neon_cli.create_timeline(
new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id
)
tenant_ids.add(tenant_id)
# Restart pageserver with logical size calculations paused
env.pageserver.stop()
env.pageserver.start(
extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
)
def get_tenant_states():
states = {}
for tenant_id in tenant_ids:
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
states[tenant_id] = tenant["state"]["slug"]
log.info(f"Tenant states: {states}")
return states
def at_least_one_active():
assert "Active" in set(get_tenant_states().values())
# One tenant should activate, then get stuck in their logical size calculation
wait_until(10, 1, at_least_one_active)
# Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate
time.sleep(5)
# We should see one tenant win the activation race, and enter logical size calculation. The rest
# will stay in Attaching state, waiting for the "warmup_limit" semaphore
expect_activated = 1
states = get_tenant_states()
assert len([s for s in states.values() if s == "Active"]) == expect_activated
assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
)
# This is zero, and subsequent checks are expect_activated - 1, because this counter does not
# count how many tenants are Active, it counts how many have finished warmup. The first tenant
# that reached Active is still stuck in its logical size calculation, and has therefore not finished warmup.
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0
# If a client accesses one of the blocked tenants, it should skip waiting for warmup and
# go active as fast as it can.
stuck_tenant_id = list(
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
)[0][0]
endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id)
endpoint.safe_psql_many(
[
"CREATE TABLE foo (x INTEGER)",
"INSERT INTO foo SELECT g FROM generate_series(1, 10) g",
]
)
endpoint.stop()
# The one we successfully accessed is now Active
expect_activated += 1
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
# The ones we didn't touch are still in Attaching
assert (
len([s for s in get_tenant_states().values() if s == "Attaching"])
== n_tenants - expect_activated
)
# Timeline creation operations also wake up Attaching tenants
stuck_tenant_id = list(
[(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
)[0][0]
pageserver_http.timeline_create(env.pg_version, stuck_tenant_id, TimelineId.generate())
expect_activated += 1
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
assert (
len([s for s in get_tenant_states().values() if s == "Attaching"])
== n_tenants - expect_activated
)
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
# When we unblock logical size calculation, all tenants should proceed to active state via
# the warmup route.
pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
def all_active():
assert all(s == "Active" for s in get_tenant_states().values())
wait_until(10, 1, all_active)
# Final control check: restarting with no failpoints at all results in all tenants coming active
# without being prompted by client I/O
env.pageserver.stop()
env.pageserver.start()
wait_until(10, 1, all_active)
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
)
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
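
The docstring of the removed test above describes the warm-up gating the test exercised: a semaphore sized by concurrent_tenant_warmup bounds how many tenants run their logical size calculation at once, and a tenant skips that queue when a client actively needs it. A minimal Rust sketch of that scheme only, with hypothetical names (WarmupGate, warm_up, activate_now); this is not the pageserver's actual implementation:

use std::future::Future;
use std::sync::Arc;
use tokio::sync::{Notify, Semaphore};

// Hypothetical types modelling the behaviour the test observed.
struct WarmupGate {
    // Capacity corresponds to the `concurrent_tenant_warmup` setting ('1' in the test).
    warmup_limit: Arc<Semaphore>,
    // Pinged when a client request (or timeline creation) needs this tenant now.
    activate_now: Arc<Notify>,
}

impl WarmupGate {
    async fn warm_up(&self, do_warmup: impl Future<Output = ()>) {
        tokio::select! {
            // Normal path: wait for a warm-up slot and hold it for the whole warm-up,
            // so only `concurrent_tenant_warmup` tenants compute logical sizes at once.
            permit = Arc::clone(&self.warmup_limit).acquire_owned() => {
                let _permit = permit.expect("semaphore is never closed");
                do_warmup.await;
            }
            // On-demand path: a client touched the tenant, so activate immediately
            // instead of queueing behind other tenants' warm-ups.
            _ = self.activate_now.notified() => {
                do_warmup.await;
            }
        }
    }
}

On the request path, something like activate_now.notify_one() would wake the waiting tenant immediately; the test above observed exactly this difference through the pageserver_tenant_startup_scheduled_total and pageserver_tenant_startup_complete_total counters.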