compute_ctl: log ttid everywhere.

Makes it easy to find compute logs by timeline id; the Loki agent can now also create a label from it. Pairs with https://github.com/neondatabase/cloud/pull/7259
2026-05-21 15:10:44 +00:00 · 2023-10-06 01:02:41 +03:00
80 changed files with 1874 additions and 5058 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,9 +36,6 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
-azure_core = "0.16.0"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16.0"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -79,7 +76,6 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
+    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
-    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
+    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,7 +44,7 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use tracing::{error, info};
+use tracing::{error, info, info_span};
 use url::Url;

 use compute_api::responses::ComputeStatus;
@@ -57,6 +57,7 @@ use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
+use utils::id::TenantTimelineId;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -249,11 +250,20 @@ fn main() -> Result<()> {

    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
+    let pspec = state.pspec.as_ref().expect("spec must be set");
+    let ttid = TenantTimelineId {
+        tenant_id: pspec.tenant_id,
+        timeline_id: pspec.timeline_id,
+    };
    drop(state);

+    // Log ttid everywhere for easier log identification (e.g. the Loki agent
+    // can create a label from it).
+    let _guard = info_span!("", ttid = %ttid).entered();
+
    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute);
-    let _configurator_handle = launch_configurator(&compute);
+    let _monitor_handle = launch_monitor(&compute, ttid);
+    let _configurator_handle = launch_configurator(&compute, ttid);

    // Start Postgres
    let mut delay_exit = false;
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -692,11 +692,10 @@ impl ComputeNode {
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
-        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, self.connstr.as_str())?;
+        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        create_availability_check_data(&mut client)?;

@@ -732,11 +731,10 @@ impl ComputeNode {
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
            client.simple_query("SET neon.forward_ddl = false")?;
-            cleanup_instance(&mut client)?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, &mut client, self.connstr.as_str())?;
+            handle_grants(&spec, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }

--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -4,11 +4,13 @@ use std::thread;
 use tracing::{error, info, instrument};

 use compute_api::responses::ComputeStatus;
+use utils::id::TenantTimelineId;

 use crate::compute::ComputeNode;

-#[instrument(skip_all)]
-fn configurator_main_loop(compute: &Arc<ComputeNode>) {
+// Log ttid everywhere
+#[instrument(name = "", fields(ttid = %ttid), skip_all)]
+fn configurator_main_loop(compute: &Arc<ComputeNode>, ttid: TenantTimelineId) {
    info!("waiting for reconfiguration requests");
    loop {
        let state = compute.state.lock().unwrap();
@@ -41,13 +43,16 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    }
 }

-pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_configurator(
+    compute: &Arc<ComputeNode>,
+    ttid: TenantTimelineId,
+) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);

    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
-            configurator_main_loop(&compute);
+            configurator_main_loop(&compute, ttid);
            info!("configurator thread is exited");
        })
        .expect("cannot launch configurator thread")
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,7 +3,8 @@ use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info};
+use tracing::{debug, info, instrument};
+use utils::id::TenantTimelineId;

 use crate::compute::ComputeNode;

@@ -12,7 +13,8 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
 // XXX: the only expected panic is at `RwLock` unwrap().
-fn watch_compute_activity(compute: &ComputeNode) {
+#[instrument(name = "", fields(ttid = %ttid), skip_all)]
+fn watch_compute_activity(compute: &ComputeNode, ttid: TenantTimelineId) {
    // Suppose that `connstr` doesn't change
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
@@ -103,11 +105,11 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_monitor(state: &Arc<ComputeNode>, ttid: TenantTimelineId) -> thread::JoinHandle<()> {
    let state = Arc::clone(state);

    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&state))
+        .spawn(move || watch_compute_activity(&state, ttid))
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,4 +1,3 @@
-use std::collections::HashMap;
 use std::fmt::Write;
 use std::fs;
 use std::fs::File;
@@ -206,37 +205,22 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 }

 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
-    // `pg_database.datconnlimit = -2` means that the database is in the
-    // invalid state. See:
-    //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-    let postgres_dbs: Vec<Database> = client
+pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
+    let postgres_dbs = client
        .query(
-            "SELECT
-                datname AS name,
-                datdba::regrole::text AS owner,
-                NOT datallowconn AS restrict_conn,
-                datconnlimit = - 2 AS invalid
-            FROM
-                pg_catalog.pg_database;",
+            "SELECT datname, datdba::regrole::text as owner
+               FROM pg_catalog.pg_database;",
            &[],
        )?
        .iter()
        .map(|row| Database {
-            name: row.get("name"),
+            name: row.get("datname"),
            owner: row.get("owner"),
-            restrict_conn: row.get("restrict_conn"),
-            invalid: row.get("invalid"),
            options: None,
        })
        .collect();

-    let dbs_map = postgres_dbs
-        .iter()
-        .map(|db| (db.name.clone(), db.clone()))
-        .collect::<HashMap<_, _>>();
-
-    Ok(dbs_map)
+    Ok(postgres_dbs)
 }

 /// Wait for Postgres to become ready to accept connections. It's ready to
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, PgIdent, Role};
+use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};

 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -161,38 +161,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

-/// Compute could be unexpectedly shut down, for example, during the
-/// database dropping. This leaves the database in the invalid state,
-/// which prevents new db creation with the same name. This function
-/// will clean it up before proceeding with catalog updates. All
-/// possible future cleanup operations may go here too.
-#[instrument(skip_all)]
-pub fn cleanup_instance(client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
-
-    for (_, db) in existing_dbs {
-        if db.invalid {
-            // After recent commit in Postgres, interrupted DROP DATABASE
-            // leaves the database in the invalid state. According to the
-            // commit message, the only option for user is to drop it again.
-            // See:
-            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-            //
-            // Postgres Neon extension is done the way, that db is de-registered
-            // in the control plane metadata only after it is dropped. So there is
-            // a chance that it still thinks that db should exist. This means
-            // that it will be re-created by `handle_databases()`. Yet, it's fine
-            // as user can just repeat drop (in vanilla Postgres they would need
-            // to do the same, btw).
-            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
-            info!("dropping invalid database {}", db.name);
-            client.execute(query.as_str(), &[])?;
-        }
-    }
-
-    Ok(())
-}
-
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
@@ -411,13 +379,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent
 /// which together provide us idempotency.
 #[instrument(skip_all)]
 pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
        info!("postgres databases:");
-        for (dbname, db) in &existing_dbs {
-            info!("    {}:{}", dbname, db.owner);
+        for r in &existing_dbs {
+            info!("    {}:{}", r.name, r.owner);
        }
    }

@@ -471,7 +439,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    if existing_dbs.get(&op.name).is_some() {
+                    // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+                    if existing_dbs.iter().any(|r| r.name == op.name) {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
@@ -488,12 +457,14 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    }

    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs = get_existing_dbs(client)?;
+    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;

    info!("cluster spec databases:");
    for db in &spec.cluster.databases {
        let name = &db.name;
-        let pg_db = existing_dbs.get(name);
+
+        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
+        let pg_db = existing_dbs.iter().find(|r| r.name == *name);

        enum DatabaseAction {
            None,
@@ -559,32 +530,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
-    info!("modifying database permissions");
-    let existing_dbs = get_existing_dbs(client)?;
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
+    info!("cluster spec grants:");

    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
    for db in &spec.cluster.databases {
-        match existing_dbs.get(&db.name) {
-            Some(pg_db) => {
-                if pg_db.restrict_conn || pg_db.invalid {
-                    info!(
-                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
-                        db.name, pg_db.invalid, pg_db.restrict_conn
-                    );
-                    continue;
-                }
-            }
-            None => {
-                bail!(
-                    "database {} doesn't exist in Postgres after handle_databases()",
-                    db.name
-                );
-            }
-        }
-
        let mut conf = Config::from_str(connstr)?;
        conf.dbname(&db.name);

@@ -623,11 +575,6 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
        // This is needed because since postgres 15 this privilege is removed by default.
-        // TODO: web_access isn't created for almost 1 year. It could be that we have
-        // active users of 1 year old projects, but hopefully not, so check it and
-        // remove this code if possible. The worst thing that could happen is that
-        // user won't be able to use public schema in NEW databases created in the
-        // very OLD project.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -86,7 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
+    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match initial_pid_file {
@@ -238,13 +238,11 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    filled_cmd
 }

-fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
-        "AZURE_STORAGE_ACCOUNT",
-        "AZURE_STORAGE_ACCESS_KEY",
    ] {
        if let Ok(value) = std::env::var(env_key) {
            cmd = cmd.env(env_key, value);
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -116,7 +116,6 @@ fn main() -> Result<()> {
            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
-            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };
@@ -817,38 +816,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    Ok(())
 }

-fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(ep_subcommand_data) => ep_subcommand_data,
-        None => bail!("no mappings subcommand provided"),
-    };
-
-    match sub_name {
-        "map" => {
-            let branch_name = sub_args
-                .get_one::<String>("branch-name")
-                .expect("branch-name argument missing");
-
-            let tenant_id = sub_args
-                .get_one::<String>("tenant-id")
-                .map(|x| TenantId::from_str(x))
-                .expect("tenant-id argument missing")
-                .expect("malformed tenant-id arg");
-
-            let timeline_id = sub_args
-                .get_one::<String>("timeline-id")
-                .map(|x| TimelineId::from_str(x))
-                .expect("timeline-id argument missing")
-                .expect("malformed timeline-id arg");
-
-            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
-
-            Ok(())
-        }
-        other => unimplemented!("mappings subcommand {other}"),
-    }
-}
-
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
@@ -1117,7 +1084,6 @@ fn cli() -> Command {
    // --id, when using a pageserver command
    let pageserver_id_arg = Arg::new("pageserver-id")
        .long("id")
-        .global(true)
        .help("pageserver id")
        .required(false);
    // --pageserver-id when using a non-pageserver command
@@ -1288,20 +1254,17 @@ fn cli() -> Command {
            Command::new("pageserver")
                .arg_required_else_help(true)
                .about("Manage pageserver")
-                .arg(pageserver_id_arg)
                .subcommand(Command::new("status"))
-                .subcommand(Command::new("start")
-                    .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone())
-                )
-                .subcommand(Command::new("stop")
-                    .about("Stop local pageserver")
-                    .arg(stop_mode_arg.clone())
-                )
-                .subcommand(Command::new("restart")
-                    .about("Restart local pageserver")
-                    .arg(pageserver_config_args.clone())
-                )
+                .arg(pageserver_id_arg.clone())
+                .subcommand(Command::new("start").about("Start local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("stop").about("Stop local pageserver")
+                .arg(pageserver_id_arg.clone())
+                            .arg(stop_mode_arg.clone()))
+                .subcommand(Command::new("restart").about("Restart local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
        )
        .subcommand(
            Command::new("attachment_service")
@@ -1358,8 +1321,8 @@ fn cli() -> Command {
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg.clone())
-                    .arg(timeline_id_arg.clone())
+                    .arg(branch_name_arg)
+                    .arg(timeline_id_arg)
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
@@ -1372,7 +1335,7 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg.clone())
+                    .arg(tenant_id_arg)
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
@@ -1383,18 +1346,6 @@ fn cli() -> Command {
                )

        )
-        .subcommand(
-            Command::new("mappings")
-                .arg_required_else_help(true)
-                .about("Manage neon_local branch name mappings")
-                .subcommand(
-                    Command::new("map")
-                        .about("Create new mapping which cannot exist already")
-                        .arg(branch_name_arg.clone())
-                        .arg(tenant_id_arg.clone())
-                        .arg(timeline_id_arg.clone())
-                )
-        )
        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
        .subcommand(
            Command::new("pg")
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -96,16 +96,6 @@ prefix_in_bucket = '/test_prefix/'

 `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

-or
-
-```toml
-[remote_storage]
-container_name = 'some-container-name'
-container_region = 'us-east'
-prefix_in_container = '/test-prefix/'
-```
-
-`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed.

 ## Repository background tasks

--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -200,12 +200,6 @@ pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
    pub options: GenericOptions,
-    // These are derived flags, not present in the spec file.
-    // They are never set by the control plane.
-    #[serde(skip_deserializing, default)]
-    pub restrict_conn: bool,
-    #[serde(skip_deserializing, default)]
-    pub invalid: bool,
 }

 /// Common type representing both SQL statement params with or without value,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -442,20 +442,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            trace!("got message {:?}", msg);

            let result = self.process_message(handler, msg, &mut query_string).await;
-            tokio::select!(
-                biased;
-                _ = shutdown_watcher() => {
-                    // We were requested to shut down.
-                    tracing::info!("shutdown request received during response flush");
-                    return Ok(())
-                },
-                flush_r = self.flush() => {
-                    flush_r?;
-                }
-            );
-
+            self.flush().await?;
            match result? {
                ProcessMsgResult::Continue => {
+                    self.flush().await?;
                    continue;
                }
                ProcessMsgResult::Break => break,
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,7 +13,6 @@ aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
-bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
@@ -27,12 +26,6 @@ metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
-azure_core.workspace = true
-azure_storage.workspace = true
-azure_storage_blobs.workspace = true
-futures-util.workspace = true
-http-types.workspace = true
-itertools.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,381 +0,0 @@
-//! Azure Blob Storage wrapper
-
-use std::num::NonZeroU32;
-use std::{borrow::Cow, collections::HashMap, io::Cursor};
-
-use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
-use anyhow::Result;
-use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::Header;
-use azure_storage::StorageCredentials;
-use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{
-    blob::operations::GetBlobBuilder,
-    prelude::{BlobClient, ContainerClient},
-};
-use futures_util::StreamExt;
-use http_types::StatusCode;
-use tokio::io::AsyncRead;
-use tracing::debug;
-
-use crate::s3_bucket::RequestKind;
-use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
-    StorageMetadata,
-};
-
-pub struct AzureBlobStorage {
-    client: ContainerClient,
-    prefix_in_container: Option<String>,
-    max_keys_per_list_response: Option<NonZeroU32>,
-    concurrency_limiter: ConcurrencyLimiter,
-}
-
-impl AzureBlobStorage {
-    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
-        debug!(
-            "Creating azure remote storage for azure container {}",
-            azure_config.container_name
-        );
-
-        let account =
-            std::env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
-        let access_key =
-            std::env::var("AZURE_STORAGE_ACCESS_KEY").expect("missing AZURE_STORAGE_ACCESS_KEY");
-
-        let credentials = StorageCredentials::access_key(account.clone(), access_key);
-
-        let builder = ClientBuilder::new(account, credentials);
-
-        let client = builder.container_client(azure_config.container_name.to_owned());
-
-        let max_keys_per_list_response =
-            if let Some(limit) = azure_config.max_keys_per_list_response {
-                Some(
-                    NonZeroU32::new(limit as u32)
-                        .ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?,
-                )
-            } else {
-                None
-            };
-
-        Ok(AzureBlobStorage {
-            client,
-            prefix_in_container: azure_config.prefix_in_container.to_owned(),
-            max_keys_per_list_response,
-            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
-        })
-    }
-
-    pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
-        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
-        match &self.prefix_in_container {
-            Some(prefix) => {
-                if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                    prefix.clone() + path_string
-                } else {
-                    format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}")
-                }
-            }
-            None => path_string.to_string(),
-        }
-    }
-
-    fn name_to_relative_path(&self, key: &str) -> RemotePath {
-        let relative_path =
-            match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) {
-                Some(stripped) => stripped,
-                // we rely on Azure to return properly prefixed paths
-                // for requests with a certain prefix
-                None => panic!(
-                    "Key {key} does not start with container prefix {:?}",
-                    self.prefix_in_container
-                ),
-            };
-        RemotePath(
-            relative_path
-                .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                .collect(),
-        )
-    }
-
-    async fn download_for_builder(
-        &self,
-        metadata: StorageMetadata,
-        builder: GetBlobBuilder,
-    ) -> Result<Download, DownloadError> {
-        let mut response = builder.into_stream();
-
-        // TODO give proper streaming response instead of buffering into RAM
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::new();
-        while let Some(part) = response.next().await {
-            let part = match part {
-                Ok(l) => l,
-                Err(e) => {
-                    return Err(if let Some(http_err) = e.as_http_error() {
-                        match http_err.status() {
-                            StatusCode::NotFound => DownloadError::NotFound,
-                            StatusCode::BadRequest => {
-                                DownloadError::BadInput(anyhow::Error::new(e))
-                            }
-                            _ => DownloadError::Other(anyhow::Error::new(e)),
-                        }
-                    } else {
-                        DownloadError::Other(e.into())
-                    });
-                }
-            };
-            let data = part
-                .data
-                .collect()
-                .await
-                .map_err(|e| DownloadError::Other(e.into()))?;
-            buf.extend_from_slice(&data.slice(..));
-        }
-        Ok(Download {
-            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(metadata),
-        })
-    }
-    // TODO get rid of this function once we have metadata included in the response
-    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
-    async fn get_metadata(
-        &self,
-        blob_client: &BlobClient,
-    ) -> Result<StorageMetadata, DownloadError> {
-        let builder = blob_client.get_metadata();
-
-        match builder.into_future().await {
-            Ok(r) => {
-                let mut map = HashMap::new();
-
-                for md in r.metadata.iter() {
-                    map.insert(
-                        md.name().as_str().to_string(),
-                        md.value().as_str().to_string(),
-                    );
-                }
-                Ok(StorageMetadata(map))
-            }
-            Err(e) => {
-                return Err(if let Some(http_err) = e.as_http_error() {
-                    match http_err.status() {
-                        StatusCode::NotFound => DownloadError::NotFound,
-                        StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)),
-                        _ => DownloadError::Other(anyhow::Error::new(e)),
-                    }
-                } else {
-                    DownloadError::Other(e.into())
-                });
-            }
-        }
-    }
-
-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
-        self.concurrency_limiter
-            .acquire(kind)
-            .await
-            .expect("semaphore is never closed")
-    }
-}
-
-fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
-    let mut res = Metadata::new();
-    for (k, v) in metadata.0.into_iter() {
-        res.insert(k, v);
-    }
-    res
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for AzureBlobStorage {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
-            });
-
-        let mut builder = self
-            .client
-            .list_blobs()
-            .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-
-        if let Some(prefix) = list_prefix {
-            builder = builder.prefix(Cow::from(prefix.to_owned()));
-        }
-
-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
-
-        let mut response = builder.into_stream();
-        let mut res = Vec::new();
-        while let Some(l) = response.next().await {
-            let entry = match l {
-                Ok(l) => l,
-                Err(e) => {
-                    return Err(if let Some(http_err) = e.as_http_error() {
-                        match http_err.status() {
-                            StatusCode::NotFound => DownloadError::NotFound,
-                            StatusCode::BadRequest => {
-                                DownloadError::BadInput(anyhow::Error::new(e))
-                            }
-                            _ => DownloadError::Other(anyhow::Error::new(e)),
-                        }
-                    } else {
-                        DownloadError::Other(e.into())
-                    });
-                }
-            };
-            let name_iter = entry
-                .blobs
-                .prefixes()
-                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.extend(name_iter);
-        }
-        Ok(res)
-    }
-
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let folder_name = folder
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone());
-
-        let mut builder = self.client.list_blobs();
-
-        if let Some(folder_name) = folder_name {
-            builder = builder.prefix(Cow::from(folder_name.to_owned()));
-        }
-
-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
-
-        let mut response = builder.into_stream();
-        let mut res = Vec::new();
-        while let Some(l) = response.next().await {
-            let entry = l.map_err(anyhow::Error::new)?;
-            let name_iter = entry
-                .blobs
-                .blobs()
-                .map(|bl| self.name_to_relative_path(&bl.name));
-            res.extend(name_iter);
-        }
-        Ok(res)
-    }
-
-    async fn upload(
-        &self,
-        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
-        data_size_bytes: usize,
-        to: &RemotePath,
-        metadata: Option<StorageMetadata>,
-    ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Put).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
-
-        // TODO FIX THIS UGLY HACK and don't buffer the entire object
-        // into RAM here, but use the streaming interface. For that,
-        // we'd have to change the interface though...
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::with_capacity(data_size_bytes);
-        tokio::io::copy(&mut from, &mut buf).await?;
-        let body = azure_core::Body::Bytes(buf.into());
-
-        let mut builder = blob_client.put_block_blob(body);
-
-        if let Some(metadata) = metadata {
-            builder = builder.metadata(to_azure_metadata(metadata));
-        }
-
-        let _response = builder.into_future().await?;
-
-        Ok(())
-    }
-
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
-
-        let metadata = self.get_metadata(&blob_client).await?;
-
-        let builder = blob_client.get();
-
-        self.download_for_builder(metadata, builder).await
-    }
-
-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-    ) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
-
-        let metadata = self.get_metadata(&blob_client).await?;
-
-        let mut builder = blob_client.get();
-
-        if let Some(end_exclusive) = end_exclusive {
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
-        } else {
-            // Open ranges are not supported by the SDK so we work around
-            // by setting the upper limit extremely high (but high enough
-            // to still be representable by signed 64 bit integers).
-            // TODO remove workaround once the SDK adds open range support
-            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
-            let end_exclusive = u64::MAX / 4;
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
-        }
-
-        self.download_for_builder(metadata, builder).await
-    }
-
-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Delete).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
-
-        let builder = blob_client.delete();
-
-        match builder.into_future().await {
-            Ok(_response) => Ok(()),
-            Err(e) => {
-                if let Some(http_err) = e.as_http_error() {
-                    if http_err.status() == StatusCode::NotFound {
-                        return Ok(());
-                    }
-                }
-                Err(anyhow::Error::new(e))
-            }
-        }
-    }
-
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        // Permit is already obtained by inner delete function
-
-        // TODO batch requests are also not supported by the SDK
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
-        for path in paths {
-            self.delete(path).await?;
-        }
-        Ok(())
-    }
-}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -4,10 +4,7 @@
 //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
 //!   * [`local_fs`] allows to use local file system as an external storage
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
-//!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
-
-mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
@@ -24,15 +21,11 @@ use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

 use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::io;
 use toml_edit::Item;
 use tracing::info;

-pub use self::{
-    azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
-    simulate_failures::UnreliableWrapper,
-};
-use s3_bucket::RequestKind;
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -46,11 +39,6 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
-/// We set this a little bit low as we currently buffer the entire file into RAM
-///
-/// Here, a limit of max 20k concurrent connections was noted.
-/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
-pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -229,7 +217,6 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
-    AzureBlob(Arc<AzureBlobStorage>),
    Unreliable(Arc<UnreliableWrapper>),
 }

@@ -241,7 +228,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_files(folder).await,
            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::AzureBlob(s) => s.list_files(folder).await,
            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }
@@ -256,7 +242,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix).await,
            Self::AwsS3(s) => s.list_prefixes(prefix).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
            Self::Unreliable(s) => s.list_prefixes(prefix).await,
        }
    }
@@ -271,7 +256,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
        }
    }
@@ -280,7 +264,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.download(from).await,
            Self::AwsS3(s) => s.download(from).await,
-            Self::AzureBlob(s) => s.download(from).await,
            Self::Unreliable(s) => s.download(from).await,
        }
    }
@@ -300,10 +283,6 @@ impl GenericRemoteStorage {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
            }
-            Self::AzureBlob(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
-                    .await
-            }
            Self::Unreliable(s) => {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
@@ -315,7 +294,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete(path).await,
            Self::AwsS3(s) => s.delete(path).await,
-            Self::AzureBlob(s) => s.delete(path).await,
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
@@ -324,7 +302,6 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete_objects(paths).await,
            Self::AwsS3(s) => s.delete_objects(paths).await,
-            Self::AzureBlob(s) => s.delete_objects(paths).await,
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
@@ -342,11 +319,6 @@ impl GenericRemoteStorage {
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
-            RemoteStorageKind::AzureContainer(azure_config) => {
-                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
-                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
-                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
-            }
        })
    }

@@ -411,9 +383,6 @@ pub enum RemoteStorageKind {
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
-    /// Azure Blob based storage, storing all files in the container
-    /// specified by the config
-    AzureContainer(AzureConfig),
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
@@ -453,45 +422,11 @@ impl Debug for S3Config {
    }
 }

-/// Azure  bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
-pub struct AzureConfig {
-    /// Name of the container to connect to.
-    pub container_name: String,
-    /// The region where the bucket is located at.
-    pub container_region: String,
-    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
-    pub prefix_in_container: Option<String>,
-    /// Azure has various limits on its API calls, we need not to exceed those.
-    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
-    pub concurrency_limit: NonZeroUsize,
-    pub max_keys_per_list_response: Option<i32>,
-}
-
-impl Debug for AzureConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("AzureConfig")
-            .field("bucket_name", &self.container_name)
-            .field("bucket_region", &self.container_region)
-            .field("prefix_in_bucket", &self.prefix_in_container)
-            .field("concurrency_limit", &self.concurrency_limit)
-            .field(
-                "max_keys_per_list_response",
-                &self.max_keys_per_list_response,
-            )
-            .finish()
-    }
-}
-
 impl RemoteStorageConfig {
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let local_path = toml.get("local_path");
        let bucket_name = toml.get("bucket_name");
        let bucket_region = toml.get("bucket_region");
-        let container_name = toml.get("container_name");
-        let container_region = toml.get("container_region");
-
-        let use_azure = container_name.is_some() && container_region.is_some();

        let max_concurrent_syncs = NonZeroUsize::new(
            parse_optional_integer("max_concurrent_syncs", toml)?
@@ -505,13 +440,9 @@ impl RemoteStorageConfig {
        )
        .context("Failed to parse 'max_sync_errors' as a positive integer")?;

-        let default_concurrency_limit = if use_azure {
-            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
-        } else {
-            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
-        };
        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
+            parse_optional_integer("concurrency_limit", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
        )
        .context("Failed to parse 'concurrency_limit' as a positive integer")?;

@@ -520,70 +451,33 @@ impl RemoteStorageConfig {
                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);

-        let endpoint = toml
-            .get("endpoint")
-            .map(|endpoint| parse_toml_string("endpoint", endpoint))
-            .transpose()?;
-
-        let storage = match (
-            local_path,
-            bucket_name,
-            bucket_region,
-            container_name,
-            container_region,
-        ) {
+        let storage = match (local_path, bucket_name, bucket_region) {
            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
-            (None, None, None, None, None) => return Ok(None),
-            (_, Some(_), None, ..) => {
+            (None, None, None) => return Ok(None),
+            (_, Some(_), None) => {
                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
            }
-            (_, None, Some(_), ..) => {
+            (_, None, Some(_)) => {
                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
            }
-            (None, Some(bucket_name), Some(bucket_region), ..) => {
-                RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                    prefix_in_bucket: toml
-                        .get("prefix_in_bucket")
-                        .map(|prefix_in_bucket| {
-                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
-                        })
-                        .transpose()?,
-                    endpoint,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                })
-            }
-            (_, _, _, Some(_), None) => {
-                bail!("'container_name' option is mandatory if 'container_region' is given ")
-            }
-            (_, _, _, None, Some(_)) => {
-                bail!("'container_name' option is mandatory if 'container_region' is given ")
-            }
-            (None, None, None, Some(container_name), Some(container_region)) => {
-                RemoteStorageKind::AzureContainer(AzureConfig {
-                    container_name: parse_toml_string("container_name", container_name)?,
-                    container_region: parse_toml_string("container_region", container_region)?,
-                    prefix_in_container: toml
-                        .get("prefix_in_container")
-                        .map(|prefix_in_container| {
-                            parse_toml_string("prefix_in_container", prefix_in_container)
-                        })
-                        .transpose()?,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                })
-            }
-            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
-                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
-            ),
-            (Some(_), Some(_), ..) => {
-                bail!("'local_path' and 'bucket_name' are mutually exclusive")
-            }
-            (Some(_), _, _, Some(_), Some(_)) => {
-                bail!("local_path and 'container_name' are mutually exclusive")
-            }
+            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
+                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                prefix_in_bucket: toml
+                    .get("prefix_in_bucket")
+                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
+                    .transpose()?,
+                endpoint: toml
+                    .get("endpoint")
+                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
+                    .transpose()?,
+                concurrency_limit,
+                max_keys_per_list_response,
+            }),
+            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from(
+                parse_toml_string("local_path", local_path)?,
+            )),
+            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
        };

        Ok(Some(RemoteStorageConfig {
@@ -619,46 +513,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
    Ok(s.to_string())
 }

-struct ConcurrencyLimiter {
-    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
-    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
-    // The helps to ensure we don't exceed the thresholds.
-    write: Arc<Semaphore>,
-    read: Arc<Semaphore>,
-}
-
-impl ConcurrencyLimiter {
-    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
-        match kind {
-            RequestKind::Get => &self.read,
-            RequestKind::Put => &self.write,
-            RequestKind::List => &self.read,
-            RequestKind::Delete => &self.write,
-        }
-    }
-
-    async fn acquire(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
-        self.for_kind(kind).acquire().await
-    }
-
-    async fn acquire_owned(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
-        Arc::clone(self.for_kind(kind)).acquire_owned().await
-    }
-
-    fn new(limit: usize) -> ConcurrencyLimiter {
-        Self {
-            read: Arc::new(Semaphore::new(limit)),
-            write: Arc::new(Semaphore::new(limit)),
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,7 +4,7 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::sync::Arc;

 use anyhow::Context;
 use aws_config::{
@@ -24,20 +24,22 @@ use aws_sdk_s3::{
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::io::{self, AsyncRead};
+use tokio::{
+    io::{self, AsyncRead},
+    sync::Semaphore,
+};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;

-use self::metrics::AttemptOutcome;
-pub(super) use self::metrics::RequestKind;
+use self::metrics::{AttemptOutcome, RequestKind};

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -48,6 +50,46 @@ pub struct S3Bucket {
    concurrency_limiter: ConcurrencyLimiter,
 }

+struct ConcurrencyLimiter {
+    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
+    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
+    // The helps to ensure we don't exceed the thresholds.
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
+}
+
 #[derive(Default)]
 struct GetObjectRequest {
    bucket: String,
@@ -514,20 +556,6 @@ impl RemoteStorage for S3Bucket {
                        .deleted_objects_total
                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        // Log a bounded number of the errors within the response:
-                        // these requests can carry 1000 keys so logging each one
-                        // would be too verbose, especially as errors may lead us
-                        // to retry repeatedly.
-                        const LOG_UP_TO_N_ERRORS: usize = 10;
-                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                            tracing::warn!(
-                                "DeleteObjects key {} failed: {}: {}",
-                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                            );
-                        }
-
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -6,7 +6,7 @@ use once_cell::sync::Lazy;
 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
-pub(crate) enum RequestKind {
+pub(super) enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,619 +0,0 @@
-use std::collections::HashSet;
-use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
-use std::ops::ControlFlow;
-use std::path::PathBuf;
-use std::sync::Arc;
-use std::time::UNIX_EPOCH;
-
-use anyhow::Context;
-use camino::Utf8Path;
-use once_cell::sync::OnceCell;
-use remote_storage::{
-    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-};
-use test_context::{test_context, AsyncTestContext};
-use tokio::task::JoinSet;
-use tracing::{debug, error, info};
-
-static LOGGING_DONE: OnceCell<()> = OnceCell::new();
-
-const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
-
-const BASE_PREFIX: &str = "test";
-
-/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
-/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
-/// See the client creation in [`create_azure_client`] for details on the required env vars.
-/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
-/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
-/// where
-/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
-///
-/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledAzureWithTestBlobs)]
-#[tokio::test]
-async fn azure_pagination_should_work(
-    ctx: &mut MaybeEnabledAzureWithTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("Azure init failed: {e:?}")
-        }
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries.
-/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `Azure_pagination_should_work` for more information.
-///
-/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
-#[tokio::test]
-async fn azure_list_files_works(
-    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("Azure init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path())
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(x).expect("must be valid path"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzure::Enabled(ctx) => ctx,
-        MaybeEnabledAzure::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzure::Enabled(ctx) => ctx,
-        MaybeEnabledAzure::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
-
-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
-
-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
-        let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
-        Ok(buf)
-    }
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(data_len))
-        .await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
-
-    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
-
-    // partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
-        .await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
-
-    Ok(())
-}
-
-fn ensure_logging_ready() {
-    LOGGING_DONE.get_or_init(|| {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-        )
-        .expect("logging init failed");
-    });
-}
-
-struct EnabledAzure {
-    client: Arc<GenericRemoteStorage>,
-    base_prefix: &'static str,
-}
-
-impl EnabledAzure {
-    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
-        let client = create_azure_client(max_keys_in_list_response)
-            .context("Azure client creation")
-            .expect("Azure client creation failed");
-
-        EnabledAzure {
-            client,
-            base_prefix: BASE_PREFIX,
-        }
-    }
-}
-
-enum MaybeEnabledAzure {
-    Enabled(EnabledAzure),
-    Disabled,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzure {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-
-        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        Self::Enabled(EnabledAzure::setup(None).await)
-    }
-}
-
-enum MaybeEnabledAzureWithTestBlobs {
-    Enabled(AzureWithTestBlobs),
-    Disabled,
-    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
-}
-
-struct AzureWithTestBlobs {
-    enabled: EnabledAzure,
-    remote_prefixes: HashSet<RemotePath>,
-    remote_blobs: HashSet<RemotePath>,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        let max_keys_in_list_response = 10;
-        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
-
-        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
-
-        match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
-            ControlFlow::Continue(uploads) => {
-                info!("Remote objects created successfully");
-
-                Self::Enabled(AzureWithTestBlobs {
-                    enabled,
-                    remote_prefixes: uploads.prefixes,
-                    remote_blobs: uploads.blobs,
-                })
-            }
-            ControlFlow::Break(uploads) => Self::UploadsFailed(
-                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
-                AzureWithTestBlobs {
-                    enabled,
-                    remote_prefixes: uploads.prefixes,
-                    remote_blobs: uploads.blobs,
-                },
-            ),
-        }
-    }
-
-    async fn teardown(self) {
-        match self {
-            Self::Disabled => {}
-            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
-            }
-        }
-    }
-}
-
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledAzureWithSimpleTestBlobs {
-    Enabled(AzureWithSimpleTestBlobs),
-    Disabled,
-    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
-}
-struct AzureWithSimpleTestBlobs {
-    enabled: EnabledAzure,
-    remote_blobs: HashSet<RemotePath>,
-}
-
-#[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
-    async fn setup() -> Self {
-        ensure_logging_ready();
-        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
-            info!(
-                "`{}` env variable is not set, skipping the test",
-                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
-            );
-            return Self::Disabled;
-        }
-
-        let max_keys_in_list_response = 10;
-        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
-
-        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
-
-        match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
-            ControlFlow::Continue(uploads) => {
-                info!("Remote objects created successfully");
-
-                Self::Enabled(AzureWithSimpleTestBlobs {
-                    enabled,
-                    remote_blobs: uploads,
-                })
-            }
-            ControlFlow::Break(uploads) => Self::UploadsFailed(
-                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
-                AzureWithSimpleTestBlobs {
-                    enabled,
-                    remote_blobs: uploads,
-                },
-            ),
-        }
-    }
-
-    async fn teardown(self) {
-        match self {
-            Self::Disabled => {}
-            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
-            }
-        }
-    }
-}
-
-fn create_azure_client(
-    max_keys_per_list_response: Option<i32>,
-) -> anyhow::Result<Arc<GenericRemoteStorage>> {
-    use rand::Rng;
-
-    let remote_storage_azure_container = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context(
-        "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled",
-    )?;
-    let remote_storage_azure_region = env::var("REMOTE_STORAGE_AZURE_REGION").context(
-        "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled",
-    )?;
-
-    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
-    // millis is just a debugging aid for easier finding the prefix later.
-    let millis = std::time::SystemTime::now()
-        .duration_since(UNIX_EPOCH)
-        .context("random Azure test prefix part calculation")?
-        .as_millis();
-
-    // because nanos can be the same for two threads so can millis, add randomness
-    let random = rand::thread_rng().gen::<u32>();
-
-    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
-        storage: RemoteStorageKind::AzureContainer(AzureConfig {
-            container_name: remote_storage_azure_container,
-            container_region: remote_storage_azure_region,
-            prefix_in_container: Some(format!("test_{millis}_{random:08x}/")),
-            concurrency_limit: NonZeroUsize::new(100).unwrap(),
-            max_keys_per_list_response,
-        }),
-    };
-    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
-    ))
-}
-
-struct Uploads {
-    prefixes: HashSet<RemotePath>,
-    blobs: HashSet<RemotePath>,
-}
-
-async fn upload_azure_data(
-    client: &Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
-    upload_tasks_count: usize,
-) -> ControlFlow<Uploads, Uploads> {
-    info!("Creating {upload_tasks_count} Azure files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
-            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
-                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
-
-            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok((upload_prefix, upload_path)) => {
-                uploaded_prefixes.insert(upload_prefix);
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    let uploads = Uploads {
-        prefixes: uploaded_prefixes,
-        blobs: uploaded_blobs,
-    };
-    if upload_tasks_failed {
-        ControlFlow::Break(uploads)
-    } else {
-        ControlFlow::Continue(uploads)
-    }
-}
-
-async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
-    info!(
-        "Removing {} objects from the remote storage during cleanup",
-        objects_to_delete.len()
-    );
-    let mut delete_tasks = JoinSet::new();
-    for object_to_delete in objects_to_delete {
-        let task_client = Arc::clone(client);
-        delete_tasks.spawn(async move {
-            debug!("Deleting remote item at path {object_to_delete:?}");
-            task_client
-                .delete(&object_to_delete)
-                .await
-                .with_context(|| format!("{object_to_delete:?} removal"))
-        });
-    }
-
-    while let Some(task_run_result) = delete_tasks.join_next().await {
-        match task_run_result {
-            Ok(task_result) => match task_result {
-                Ok(()) => {}
-                Err(e) => error!("Delete task failed: {e:?}"),
-            },
-            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
-        }
-    }
-}
-
-// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
-async fn upload_simple_azure_data(
-    client: &Arc<GenericRemoteStorage>,
-    upload_tasks_count: usize,
-) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
-    info!("Creating {upload_tasks_count} Azure files");
-    let mut upload_tasks = JoinSet::new();
-    for i in 1..upload_tasks_count + 1 {
-        let task_client = Arc::clone(client);
-        upload_tasks.spawn(async move {
-            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(
-                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
-            )
-            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
-            debug!("Creating remote item {i} at path {blob_path:?}");
-
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
-
-            Ok::<_, anyhow::Error>(blob_path)
-        });
-    }
-
-    let mut upload_tasks_failed = false;
-    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
-    while let Some(task_run_result) = upload_tasks.join_next().await {
-        match task_run_result
-            .context("task join failed")
-            .and_then(|task_result| task_result.context("upload task failed"))
-        {
-            Ok(upload_path) => {
-                uploaded_blobs.insert(upload_path);
-            }
-            Err(e) => {
-                error!("Upload task failed: {e:?}");
-                upload_tasks_failed = true;
-            }
-        }
-    }
-
-    if upload_tasks_failed {
-        ControlFlow::Break(uploaded_blobs)
-    } else {
-        ControlFlow::Continue(uploaded_blobs)
-    }
-}
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,9 +1,8 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
-use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::error;

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -26,7 +25,7 @@ pub enum ApiError {
    PreconditionFailed(Box<str>),

    #[error("Resource temporarily unavailable: {0}")]
-    ResourceUnavailable(Cow<'static, str>),
+    ResourceUnavailable(String),

    #[error("Shutting down")]
    ShuttingDown,
@@ -116,12 +115,10 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {

 pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-
-    match api_error {
-        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
-        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
-        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        _ => error!("Error processing HTTP request: {api_error:#}"),
+    if let ApiError::InternalServerError(_) = api_error {
+        error!("Error processing HTTP request: {api_error:?}");
+    } else {
+        error!("Error processing HTTP request: {api_error:#}");
    }

    api_error.into_response()
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -58,7 +58,7 @@ where
 // to get that.
 impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
+        other.wake_num.partial_cmp(&self.wake_num)
    }
 }

--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -27,8 +27,8 @@ and old one if it exists.
 * the filecache: a struct that allows communication with the Postgres file cache.
 On startup, we connect to the filecache and hold on to the connection for the
 entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
-usage and sends rolling aggregates to the runner.
+* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
+listening for `memory.high` events and setting its `memory.{high,max}` values.
 * the runner: the runner marries the filecache and cgroup watcher together,
 communicating with the agent throught the `Dispatcher`, and then calling filecache
 and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -1,38 +1,161 @@
-use std::fmt::{self, Debug, Formatter};
-use std::time::{Duration, Instant};
-
-use anyhow::{anyhow, Context};
-use cgroups_rs::{
-    hierarchies::{self, is_cgroup2_unified_mode},
-    memory::MemController,
-    Subsystem,
+use std::{
+    fmt::{Debug, Display},
+    fs,
+    pin::pin,
+    sync::atomic::{AtomicU64, Ordering},
 };
-use tokio::sync::watch;
+
+use anyhow::{anyhow, bail, Context};
+use cgroups_rs::{
+    freezer::FreezerController,
+    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    memory::MemController,
+    MaxValue,
+    Subsystem::{Freezer, Mem},
+};
+use inotify::{EventStream, Inotify, WatchMask};
+use tokio::sync::mpsc::{self, error::TryRecvError};
+use tokio::time::{Duration, Instant};
+use tokio_stream::{Stream, StreamExt};
 use tracing::{info, warn};

+use crate::protocol::Resources;
+use crate::MiB;
+
+/// Monotonically increasing counter of the number of memory.high events
+/// the cgroup has experienced.
+///
+/// We use this to determine if a modification to the `memory.events` file actually
+/// changed the `high` field. If not, we don't care about the change. When we
+/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
+/// to see if it changed since last time.
+pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Monotonically increasing counter that gives each cgroup event a unique id.
+///
+/// This allows us to answer questions like "did this upscale arrive before this
+/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
+/// with a sequence number. As such, prefer to use the `Sequenced` type rather
+/// than this static directly.
+static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
+
+/// A memory event type reported in memory.events.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum MemoryEvent {
+    Low,
+    High,
+    Max,
+    Oom,
+    OomKill,
+    OomGroupKill,
+}
+
+impl MemoryEvent {
+    fn as_str(&self) -> &str {
+        match self {
+            MemoryEvent::Low => "low",
+            MemoryEvent::High => "high",
+            MemoryEvent::Max => "max",
+            MemoryEvent::Oom => "oom",
+            MemoryEvent::OomKill => "oom_kill",
+            MemoryEvent::OomGroupKill => "oom_group_kill",
+        }
+    }
+}
+
+impl Display for MemoryEvent {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
 /// Configuration for a `CgroupWatcher`
 #[derive(Debug, Clone)]
 pub struct Config {
-    /// Interval at which we should be fetching memory statistics
-    memory_poll_interval: Duration,
+    // The target difference between the total memory reserved for the cgroup
+    // and the value of the cgroup's memory.high.
+    //
+    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
+    // use (equal to system memory, minus whatever's taken out for the file cache).
+    oom_buffer_bytes: u64,

-    /// The number of samples used in constructing aggregated memory statistics
-    memory_history_len: usize,
-    /// The number of most recent samples that will be periodically logged.
-    ///
-    /// Each sample is logged exactly once. Increasing this value means that recent samples will be
-    /// logged less frequently, and vice versa.
-    ///
-    /// For simplicity, this value must be greater than or equal to `memory_history_len`.
-    memory_history_log_interval: usize,
+    // The amount of memory, in bytes, below a proposed new value for
+    // memory.high that the cgroup's memory usage must be for us to downscale
+    //
+    // In other words, we can downscale only when:
+    //
+    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
+    //
+    // TODO: there's some minor issues with this approach -- in particular, that we might have
+    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
+    pub(crate) memory_high_buffer_bytes: u64,
+
+    // The maximum duration, in milliseconds, that we're allowed to pause
+    // the cgroup for while waiting for the autoscaler-agent to upscale us
+    max_upscale_wait: Duration,
+
+    // The required minimum time, in milliseconds, that we must wait before re-freezing
+    // the cgroup while waiting for the autoscaler-agent to upscale us.
+    do_not_freeze_more_often_than: Duration,
+
+    // The amount of memory, in bytes, that we should periodically increase memory.high
+    // by while waiting for the autoscaler-agent to upscale us.
+    //
+    // This exists to avoid the excessive throttling that happens when a cgroup is above its
+    // memory.high for too long. See more here:
+    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
+    memory_high_increase_by_bytes: u64,
+
+    // The period, in milliseconds, at which we should repeatedly increase the value
+    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
+    // is still being hit.
+    //
+    // Technically speaking, this actually serves as a rate limit to moderate responding to
+    // memory.high events, but these are roughly equivalent if the process is still allocating
+    // memory.
+    memory_high_increase_every: Duration,
+}
+
+impl Config {
+    /// Calculate the new value for the cgroups memory.high based on system memory
+    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
+        total_system_mem.saturating_sub(self.oom_buffer_bytes)
+    }
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
-            memory_poll_interval: Duration::from_millis(100),
-            memory_history_len: 5, // use 500ms of history for decision-making
-            memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
+            oom_buffer_bytes: 100 * MiB,
+            memory_high_buffer_bytes: 100 * MiB,
+            // while waiting for upscale, don't freeze for more than 20ms every 1s
+            max_upscale_wait: Duration::from_millis(20),
+            do_not_freeze_more_often_than: Duration::from_millis(1000),
+            // while waiting for upscale, increase memory.high by 10MiB every 25ms
+            memory_high_increase_by_bytes: 10 * MiB,
+            memory_high_increase_every: Duration::from_millis(25),
+        }
+    }
+}
+
+/// Used to represent data that is associated with a certain point in time, such
+/// as an upscale request or memory.high event.
+///
+/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
+/// a unique sequence number. Sequence numbers are monotonically increasing,
+/// allowing us to answer questions like "did this upscale happen after this
+/// memory.high event?" by comparing the sequence numbers of the two events.
+#[derive(Debug, Clone)]
+pub struct Sequenced<T> {
+    seqnum: u64,
+    data: T,
+}
+
+impl<T> Sequenced<T> {
+    pub fn new(data: T) -> Self {
+        Self {
+            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
+            data,
        }
    }
 }
@@ -47,14 +170,74 @@ impl Default for Config {
 pub struct CgroupWatcher {
    pub config: Config,

+    /// The sequence number of the last upscale.
+    ///
+    /// If we receive a memory.high event that has a _lower_ sequence number than
+    /// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
+    /// can safely ignore it.
+    ///
+    /// Note: this doesn't _need_ interior mutability, but we use an atomic
+    /// anyways so that methods take `&self`, not `&mut self`.
+    last_upscale_seqnum: AtomicU64,
+
+    /// A channel on which we send messages to request upscale from the dispatcher.
+    upscale_requester: mpsc::Sender<()>,
+
    /// The actual cgroup we are watching and managing.
    cgroup: cgroups_rs::Cgroup,
 }

+/// Look up the counter for one event type in a cgroup `memory.events` file.
+///
+/// `path` specifies the path to the desired `memory.events` file.
+/// For more info, see the `memory.events` section of the [kernel docs]
+/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
+fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
+    let text = fs::read_to_string(path)
+        .with_context(|| format!("failed to read memory.events from {path}"))?;
+
+    // Each line of the file is an event name and its count separated by a
+    // space, for example:
+    // low 42
+    // high 101
+    let Some((_, raw_count)) = text
+        .lines()
+        .filter_map(|line| line.split_once(' '))
+        .find(|(name, _)| *name == event.as_str())
+    else {
+        bail!("failed to find entry for memory.{event} events in {path}");
+    };
+    raw_count
+        .parse::<u64>()
+        .with_context(|| format!("failed to parse memory.{event} as u64"))
+}
+
+/// Start watching `path` with inotify and return the resulting stream, which
+/// yields an event each time the file is modified.
+fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
+    info!("creating file watcher for {path}");
+    let watcher = Inotify::init().context("failed to initialize file watcher")?;
+    watcher
+        .watches()
+        .add(path, WatchMask::MODIFY)
+        .with_context(|| format!("failed to start watching {path}"))?;
+    // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
+    // to store one event at a time - if the event gets written over, that's
+    // ok. We still see that there is an event. For more information, see:
+    // https://man7.org/linux/man-pages/man7/inotify.7.html
+    let event_buffer = [0u8; 1024];
+    let stream = watcher.into_event_stream(event_buffer);
+    stream.context("failed to start inotify event stream")
+}
+
 impl CgroupWatcher {
    /// Create a new `CgroupWatcher`.
    #[tracing::instrument(skip_all, fields(%name))]
-    pub fn new(name: String) -> anyhow::Result<Self> {
+    pub fn new(
+        name: String,
+        // A channel on which to send upscale requests
+        upscale_requester: mpsc::Sender<()>,
+    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
        // TODO: clarify exactly why we need v2
        // Make sure cgroups v2 (aka unified) are supported
        if !is_cgroup2_unified_mode() {
@@ -62,203 +245,410 @@ impl CgroupWatcher {
        }
        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);

-        Ok(Self {
-            cgroup,
-            config: Default::default(),
-        })
+        // Start monitoring the cgroup for memory events. In general, for
+        // cgroups v2 (aka unified), metrics are reported in files like
+        // > `/sys/fs/cgroup/{name}/{metric}`
+        // We are looking for `memory.high` events, which are stored in the
+        // file `memory.events`. For more info, see the `memory.events` section
+        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
+        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
+        let memory_events = create_file_watcher(&path)
+            .with_context(|| format!("failed to create event watcher for {path}"))?
+            // This would be nice with .inspect_err followed by .ok
+            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
+                Ok(high) => Some(high),
+                Err(error) => {
+                    // TODO: Might want to just panic here
+                    warn!(?error, "failed to read high events count from {}", &path);
+                    None
+                }
+            })
+            // Only report the event if the memory.high count increased
+            .filter_map(|high| {
+                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
+                    Some(high)
+                } else {
+                    None
+                }
+            })
+            .map(Sequenced::new);
+
+        let initial_count = get_event_count(
+            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
+            MemoryEvent::High,
+        )?;
+
+        info!(initial_count, "initial memory.high event count");
+
+        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
+        // running in the cgroup before that caused it to be non-zero.
+        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
+
+        Ok((
+            Self {
+                cgroup,
+                upscale_requester,
+                last_upscale_seqnum: AtomicU64::new(0),
+                config: Default::default(),
+            },
+            memory_events,
+        ))
    }

    /// The entrypoint for the `CgroupWatcher`.
    #[tracing::instrument(skip_all)]
-    pub async fn watch(
+    pub async fn watch<E>(
        &self,
-        updates: watch::Sender<(Instant, MemoryHistory)>,
-    ) -> anyhow::Result<()> {
-        // this requirement makes the code a bit easier to work with; see the config for more.
-        assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);
+        // These are ~dependency injected~ (fancy, I know) because this function
+        // should never return.
+        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
+        // -> therefore: if we want to stick it in an Arc so many threads can access
+        //    it, methods can never take mutable access.
+        //     - note: we use the Arc strategy so that a) we can call this function
+        //             right here and b) the runner can call the set/get_memory methods
+        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
+        //    we just pass them in here instead of holding them in fields, as that
+        //    would require this method to take &mut self.
+        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
+        events: E,
+    ) -> anyhow::Result<()>
+    where
+        E: Stream<Item = Sequenced<u64>>,
+    {
+        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut last_memory_high_increase_at: Option<Instant> = None;
+        let mut events = pin!(events);

-        let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
-        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
-        // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0
+        // Are we waiting to be upscaled? Could be true if we request upscale due
+        // to a memory.high event and it does not arrive in time.
+        let mut waiting_on_upscale = false;

-        let mem_controller = self.memory()?;
+        loop {
+            tokio::select! {
+                upscale = upscales.recv() => {
+                    let Sequenced { seqnum, data } = upscale
+                        .context("failed to listen on upscale notification channel")?;
+                    waiting_on_upscale = false;
+                    last_memory_high_increase_at = None;
+                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                }
+                event = events.next() => {
+                    let Some(Sequenced { seqnum, .. }) = event else {
+                        bail!("failed to listen for memory.high events")
+                    };
+                    // The memory.high came before our last upscale, so we consider
+                    // it resolved
+                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
+                        info!(
+                            "received memory.high event, but it came before our last upscale -> ignoring it"
+                        );
+                        continue;
+                    }

-        // buffer for samples that will be logged. once full, it remains so.
-        let history_log_len = self.config.memory_history_log_interval;
-        let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
+                    // The memory.high came after our latest upscale. We don't
+                    // want to do anything yet, so peek the next event in hopes
+                    // that it's an upscale.
+                    if let Some(upscale_num) = self
+                        .upscaled(&mut upscales)
+                        .context("failed to check if we were upscaled")?
+                    {
+                        if upscale_num > seqnum {
+                            info!(
+                                "received memory.high event, but it came before our last upscale -> ignoring it"
+                            );
+                            continue;
+                        }
+                    }

-        for t in 0_u64.. {
-            ticker.tick().await;
+                    // If it's been long enough since we last froze, freeze the
+                    // cgroup and request upscale
+                    if wait_to_freeze.is_elapsed() {
+                        info!("received memory.high event -> requesting upscale");
+                        waiting_on_upscale = self
+                            .handle_memory_high_event(&mut upscales)
+                            .await
+                            .context("failed to handle upscale")?;
+                        wait_to_freeze
+                            .as_mut()
+                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
+                        continue;
+                    }

-            let now = Instant::now();
-            let mem = Self::memory_usage(mem_controller);
+                    // Ok, we can't freeze, just request upscale
+                    if !waiting_on_upscale {
+                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");

-            let i = t as usize % history_log_len;
-            history_log_buf[i] = mem;
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to request upscaling because we got upscaled");
+                            continue;
+                        }
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+                        waiting_on_upscale = true;
+                        continue;
+                    }

-            // We're taking *at most* memory_history_len values; we may be bounded by the total
-            // number of samples that have come in so far.
-            let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
-            // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
-            // that we just inserted a value there, so the end of the iterator will *include* the
-            // value at i, rather than stopping just short of it.
-            let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);
+                    // Shoot, we can't freeze and we're still waiting on upscale,
+                    // increase memory.high to reduce throttling
+                    let can_increase_memory_high = match last_memory_high_increase_at {
+                        None => true,
+                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
+                    };
+                    if can_increase_memory_high {
+                        info!(
+                            "received memory.high event, \
+                            but too soon to refreeze and already requested upscale \
+                            -> increasing memory.high"
+                        );

-            let summary = MemoryHistory {
-                avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
-                    / samples_count as u64,
-                samples_count,
-                samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
+                        // Check to make sure we haven't been upscaled in the
+                        // meantime (can happen if the agent independently decides
+                        // to upscale us again)
+                        if self
+                            .upscaled(&mut upscales)
+                            .context("failed to check if we were upscaled")?
+                            .is_some()
+                        {
+                            info!("no need to increase memory.high because got upscaled");
+                            continue;
+                        }
+
+                        // Request upscale anyways (the agent will handle deduplicating
+                        // requests)
+                        self.upscale_requester
+                            .send(())
+                            .await
+                            .context("failed to request upscale")?;
+
+                        let memory_high =
+                            self.get_memory_high_bytes().context("failed to get memory.high")?;
+                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
+                        info!(
+                            current_high_bytes = memory_high,
+                            new_high_bytes = new_high,
+                            "updating memory.high"
+                        );
+                        self.set_memory_high_bytes(new_high)
+                            .context("failed to set memory.high")?;
+                        last_memory_high_increase_at = Some(Instant::now());
+                        continue;
+                    }
+
+                    info!("received memory.high event, but can't do anything");
+                }
            };
-
-            // Log the current history if it's time to do so. Because `history_log_buf` has length
-            // equal to the logging interval, we can just log the entire buffer every time we set
-            // the last entry, which also means that for this log line, we can ignore that it's a
-            // ring buffer (because all the entries are in order of increasing time).
-            if i == history_log_len - 1 {
-                info!(
-                    history = ?MemoryStatus::debug_slice(&history_log_buf),
-                    summary = ?summary,
-                    "Recent cgroup memory statistics history"
-                );
-            }
-
-            updates
-                .send((now, summary))
-                .context("failed to send MemoryHistory")?;
        }
+    }

-        unreachable!()
+    /// Handle a `memory.high`, returning whether we are still waiting on upscale
+    /// by the time the function returns.
+    ///
+    /// The general plan for handling a `memory.high` event is as follows:
+    /// 1. Freeze the cgroup
+    /// 2. Start a timer for `self.config.max_upscale_wait`
+    /// 3. Request upscale (if the request cannot be sent: thaw, then return the error)
+    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
+    /// 5. Return whether or not we are still waiting for upscale. If we are,
+    ///    we'll increase the cgroups memory.high to avoid getting oom killed
+    #[tracing::instrument(skip_all)]
+    async fn handle_memory_high_event(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<bool> {
+        // Immediately freeze the cgroup before doing anything else.
+        info!("received memory.high event -> freezing cgroup");
+        self.freeze().context("failed to freeze cgroup")?;
+
+        // We'll use this for logging durations
+        let start_time = Instant::now();
+
+        // Await the upscale until we have to unfreeze
+        let timed =
+            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
+
+        // Request the upscale
+        info!(wait = ?self.config.max_upscale_wait, "sending request for immediate upscaling");
+        if let Err(e) = self.upscale_requester.send(()).await {
+            // **important**: unfreeze the cgroup before reporting the error
+            info!("error requesting upscale -> thawing cgroup");
+            self.thaw()
+                .context("failed to thaw cgroup after failing to request upscale")?;
+            return Err(anyhow::Error::new(e).context("failed to request upscale"));
+        }
+
+        let waiting_on_upscale = match timed.await {
+            Ok(Ok(())) => {
+                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
+                false
+            }
+            // **important**: unfreeze the cgroup before ?-reporting the error
+            Ok(Err(e)) => {
+                info!("error waiting for upscale -> thawing cgroup");
+                self.thaw()
+                    .context("failed to thaw cgroup after errored waiting for upscale")?;
+                Err(e.context("failed to await upscale"))?
+            }
+            Err(_) => {
+                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
+                true
+            }
+        };
+
+        info!("thawing cgroup");
+        self.thaw().context("failed to thaw cgroup")?;
+
+        Ok(waiting_on_upscale)
+    }
+
+    /// Non-blocking poll of the upscale channel: returns the sequence number of
+    /// a pending upscale notification, or `None` if nothing is queued.
+    #[tracing::instrument(skip_all)]
+    fn upscaled(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<Option<u64>> {
+        match upscales.try_recv() {
+            Err(TryRecvError::Empty) => Ok(None),
+            Err(TryRecvError::Disconnected) => {
+                bail!("upscale notification channel was disconnected")
+            }
+            Ok(Sequenced { seqnum, data }) => {
+                // Record this upscale as the most recent one we know about
+                self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+                info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
+                Ok(Some(seqnum))
+            }
+        }
+    }
+
+    /// Wait for the next upscale notification and record its sequence number
+    /// as the latest upscale.
+    ///
+    /// This is used in `handle_memory_high_event`, where we need to listen
+    /// for upscales in particular so we know if we can thaw the cgroup early.
+    #[tracing::instrument(skip_all)]
+    async fn await_upscale(
+        &self,
+        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+    ) -> anyhow::Result<()> {
+        let received = upscales.recv().await;
+        let Sequenced { seqnum, .. } = received.context("error listening for upscales")?;
+
+        // Remember this upscale so that `watch` can ignore memory.high events
+        // that were sequenced before it.
+        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
+        Ok(())
+    }
+
+    /// Return the path reported by the underlying `cgroups_rs::Cgroup`.
+    pub fn path(&self) -> &str {
+        cgroups_rs::Cgroup::path(&self.cgroup)
+    }
+}
+
+// Methods for manipulating the actual cgroup
+impl CgroupWatcher {
+    /// Find the freezer controller among the cgroup's subsystems, failing if
+    /// the cgroup does not have one.
+    fn freezer(&self) -> anyhow::Result<&FreezerController> {
+        let subsystems = self.cgroup.subsystems();
+        let found = subsystems.iter().find_map(|sub| match sub {
+            Freezer(controller) => Some(controller),
+            _ => None,
+        });
+        match found {
+            Some(controller) => Ok(controller),
+            None => anyhow::bail!("could not find freezer subsystem"),
+        }
+    }
+
+    /// Attempt to freeze the cgroup through its freezer controller.
+    pub fn freeze(&self) -> anyhow::Result<()> {
+        let freezer = self
+            .freezer()
+            .context("failed to get freezer subsystem")?;
+        freezer.freeze().context("failed to freeze")
+    }
+
+    /// Attempt to thaw the cgroup through its freezer controller.
+    pub fn thaw(&self) -> anyhow::Result<()> {
+        let freezer = self
+            .freezer()
+            .context("failed to get freezer subsystem")?;
+        freezer.thaw().context("failed to thaw")
    }

    /// Get a handle on the memory subsystem.
+    ///
+    /// Note: this only searches the cgroup's list of subsystems for the memory
+    /// controller; getting the handle does not access any of the files we
+    /// care about, such as memory.high and memory.events
    fn memory(&self) -> anyhow::Result<&MemController> {
-        self.cgroup
+        if let Some(Mem(memory)) = self
+            .cgroup
            .subsystems()
            .iter()
-            .find_map(|sub| match sub {
-                Subsystem::Mem(c) => Some(c),
-                _ => None,
+            .find(|sub| matches!(sub, Mem(_)))
+        {
+            Ok(memory)
+        } else {
+            anyhow::bail!("could not find memory subsystem")
+        }
+    }
+
+    /// Report the `usage_in_bytes` value from the memory controller's stats.
+    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
+        let controller = self
+            .memory()
+            .context("failed to get memory subsystem")?;
+        let stats = controller.memory_stat();
+        Ok(stats.usage_in_bytes)
+    }
+
+    /// Set the cgroup memory.high threshold, capping the value at `i64::MAX` bytes.
+    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Value(i64::try_from(bytes).unwrap_or(i64::MAX)))
+    }
+
+    /// Disable the memory.high threshold by setting it to 'max'.
+    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+        Self::set_memory_high_internal(self, MaxValue::Max)
+    }
+
+    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
+        let controller = self.memory().context("failed to get memory subsystem")?;
+        controller
+            .set_mem(cgroups_rs::memory::SetMemory {
+                high: Some(value),
+                low: None,
+                max: None,
+                min: None,
            })
-            .ok_or_else(|| anyhow!("could not find memory subsystem"))
+            .map_err(anyhow::Error::from)
    }

-    /// Given a handle on the memory subsystem, returns the current memory information
-    fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
-        let stat = mem_controller.memory_stat().stat;
-        MemoryStatus {
-            non_reclaimable: stat.active_anon + stat.inactive_anon,
+    /// Get the memory.high threshold in bytes (`max` is reported as `i64::MAX`).
+    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
+        let high = self
+            .memory()
+            .context("failed to get memory subsystem while getting memory statistics")?
+            .get_mem()
+            .map(|mem| mem.high)
+            .context("failed to get memory statistics from subsystem")?;
+        match high {
+            Some(MaxValue::Max) => Ok(i64::MAX as u64),
+            Some(MaxValue::Value(high)) => Ok(high as u64),
+            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
        }
    }
 }
-
-// Helper function for `CgroupWatcher::watch`
-fn ring_buf_recent_values_iter<T>(
-    buf: &[T],
-    last_value_idx: usize,
-    count: usize,
-) -> impl '_ + Iterator<Item = &T> {
-    // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
-    // easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
-    assert!(count <= buf.len());
-
-    buf.iter()
-        // 'cycle' because the values could wrap around
-        .cycle()
-        // with 'cycle', this skip is more like 'offset', and functionally this is
-        // offsettting by 'last_value_idx - count (mod buf.len())', but we have to be
-        // careful to avoid underflow, so we pre-add buf.len().
-        // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
-        .skip((buf.len() + last_value_idx + 1 - count) % buf.len())
-        .take(count)
-}
-
-/// Summary of recent memory usage
-#[derive(Debug, Copy, Clone)]
-pub struct MemoryHistory {
-    /// Rolling average of non-reclaimable memory usage samples over the last `history_period`
-    pub avg_non_reclaimable: u64,
-
-    /// The number of samples used to construct this summary
-    pub samples_count: usize,
-    /// Total timespan between the first and last sample used for this summary
-    pub samples_span: Duration,
-}
-
-#[derive(Debug, Copy, Clone)]
-pub struct MemoryStatus {
-    non_reclaimable: u64,
-}
-
-impl MemoryStatus {
-    fn zeroed() -> Self {
-        MemoryStatus { non_reclaimable: 0 }
-    }
-
-    fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
-        struct DS<'a>(&'a [MemoryStatus]);
-
-        impl<'a> Debug for DS<'a> {
-            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                f.debug_struct("[MemoryStatus]")
-                    .field(
-                        "non_reclaimable[..]",
-                        &Fields(self.0, |stat: &MemoryStatus| {
-                            BytesToGB(stat.non_reclaimable)
-                        }),
-                    )
-                    .finish()
-            }
-        }
-
-        struct Fields<'a, F>(&'a [MemoryStatus], F);
-
-        impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
-            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                f.debug_list().entries(self.0.iter().map(&self.1)).finish()
-            }
-        }
-
-        struct BytesToGB(u64);
-
-        impl Debug for BytesToGB {
-            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-                f.write_fmt(format_args!(
-                    "{:.3}Gi",
-                    self.0 as f64 / (1_u64 << 30) as f64
-                ))
-            }
-        }
-
-        DS(slice)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    #[test]
-    fn ring_buf_iter() {
-        let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
-
-        let values = |offset, count| {
-            super::ring_buf_recent_values_iter(&buf, offset, count)
-                .copied()
-                .collect::<Vec<i32>>()
-        };
-
-        // Boundary conditions: start, end, and entire thing:
-        assert_eq!(values(0, 1), [0]);
-        assert_eq!(values(3, 4), [0, 1, 2, 3]);
-        assert_eq!(values(9, 4), [6, 7, 8, 9]);
-        assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
-
-        // "normal" operation: no wraparound
-        assert_eq!(values(7, 4), [4, 5, 6, 7]);
-
-        // wraparound:
-        assert_eq!(values(0, 4), [7, 8, 9, 0]);
-        assert_eq!(values(1, 4), [8, 9, 0, 1]);
-        assert_eq!(values(2, 4), [9, 0, 1, 2]);
-        assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
-    }
-}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -12,10 +12,12 @@ use futures::{
    stream::{SplitSink, SplitStream},
    SinkExt, StreamExt,
 };
+use tokio::sync::mpsc;
 use tracing::info;

+use crate::cgroup::Sequenced;
 use crate::protocol::{
-    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
    PROTOCOL_MIN_VERSION,
 };

@@ -34,6 +36,13 @@ pub struct Dispatcher {
    /// We send messages to the agent through `sink`
    sink: SplitSink<WebSocket, Message>,

+    /// Used to notify the cgroup when we are upscaled.
+    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+
+    /// When the cgroup requests upscale it will send on this channel. In response
+    /// we send an `UpscaleRequest` to the agent.
+    pub(crate) request_upscale_events: mpsc::Receiver<()>,
+
    /// The protocol version we have agreed to use with the agent. This is negotiated
    /// during the creation of the dispatcher, and should be the highest shared protocol
    /// version.
@@ -52,7 +61,11 @@ impl Dispatcher {
    /// 1. Wait for the agent to sent the range of protocols it supports.
    /// 2. Send a protocol version that works for us as well, or an error if there
    ///    is no compatible version.
-    pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
+    pub async fn new(
+        stream: WebSocket,
+        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
+        request_upscale_events: mpsc::Receiver<()>,
+    ) -> anyhow::Result<Self> {
        let (mut sink, mut source) = stream.split();

        // Figure out the highest protocol version we both support
@@ -106,10 +119,22 @@ impl Dispatcher {
        Ok(Self {
            sink,
            source,
+            notify_upscale_events,
+            request_upscale_events,
            proto_version: highest_shared_version,
        })
    }

+    /// Forward an upscale notification to the cgroup watcher over the
+    /// `notify_upscale_events` channel, returning an error if the send fails.
+    #[tracing::instrument(skip_all, fields(?resources))]
+    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
+        let upscale_tx = &self.notify_upscale_events;
+        let sent = upscale_tx.send(resources).await;
+        // NOTE(review): the message mentions a oneshot sender, but only `resources` is sent
+        sent.context("failed to send resources and oneshot sender across channel")
+    }
+
    /// Send a message to the agent.
    ///
    /// Although this function is small, it has one major benefit: it is the only
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -5,16 +5,18 @@
 //! all functionality.

 use std::fmt::Debug;
+use std::sync::Arc;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Context};
 use axum::extract::ws::{Message, WebSocket};
 use futures::StreamExt;
-use tokio::sync::{broadcast, watch};
+use tokio::sync::broadcast;
+use tokio::sync::mpsc;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};

-use crate::cgroup::{self, CgroupWatcher};
+use crate::cgroup::{CgroupWatcher, Sequenced};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -26,7 +28,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
 pub struct Runner {
    config: Config,
    filecache: Option<FileCacheState>,
-    cgroup: Option<CgroupState>,
+    cgroup: Option<Arc<CgroupWatcher>>,
    dispatcher: Dispatcher,

    /// We "mint" new message ids by incrementing this counter and taking the value.
@@ -43,14 +45,6 @@ pub struct Runner {
    kill: broadcast::Receiver<()>,
 }

-#[derive(Debug)]
-struct CgroupState {
-    watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
-    /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
-    /// requests.
-    threshold: u64,
-}
-
 /// Configuration for a `Runner`
 #[derive(Debug)]
 pub struct Config {
@@ -68,56 +62,16 @@ pub struct Config {
    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,
-
-    /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
-    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
-    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
-    /// threshold.
-    ///
-    /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
-    /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
-    /// memory.
-    ///
-    /// The default value of `0.15` means that we *guarantee* sending upscale requests if the
-    /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
-    /// memory for the file cache).
-    cgroup_min_overhead_fraction: f64,
-
-    cgroup_downscale_threshold_buffer_bytes: u64,
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
            sys_buffer_bytes: 100 * MiB,
-            cgroup_min_overhead_fraction: 0.15,
-            cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
        }
    }
 }

-impl Config {
-    fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
-        // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
-        // and thus be non-reclaimable, so we should allow for additional memory usage.
-        //
-        // If the file cache sits on disk, our desired stable system state is for it to be fully
-        // page cached (its contents should only be paged to/from disk in situations where we can't
-        // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
-        // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
-        // out the file cache.
-        let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
-
-        // Even if we're not separately making room for the file cache (if it's in tmpfs), we still
-        // want our threshold to be met gracefully instead of letting postgres get OOM-killed.
-        // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
-        // remaining above the threshold.
-        let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
-
-        memory_remaining_for_cgroup.min(max_threshold)
-    }
-}
-
 impl Runner {
    /// Create a new monitor.
    #[tracing::instrument(skip_all, fields(?config, ?args))]
@@ -133,7 +87,12 @@ impl Runner {
            "invalid monitor Config: sys_buffer_bytes cannot be 0"
        );

-        let dispatcher = Dispatcher::new(ws)
+        // *NOTE*: the dispatcher and cgroup manager talk through these channels
+        // so make sure they each get the correct half, nothing is dropped, etc.
+        let (notified_send, notified_recv) = mpsc::channel(1);
+        let (requesting_send, requesting_recv) = mpsc::channel(1);
+
+        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
            .await
            .context("error creating new dispatcher")?;

@@ -147,9 +106,45 @@ impl Runner {
            kill,
        };

-        let mem = get_total_system_memory();
+        // If we have both the cgroup and file cache integrations enabled, it's possible for
+        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+        // we *do* still want to determine the file cache size before setting the cgroup's
+        // memory.high, so it's not as simple as just swapping the order.
+        //
+        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+        // of a hacky solution, but helps with reliability.
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");

-        let mut file_cache_disk_size = 0;
+            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+                .context("failed to create cgroup manager")?;
+
+            info!("temporarily unsetting memory.high");
+
+            // Temporarily un-set cgroup memory.high; see above.
+            cgroup
+                .unset_memory_high()
+                .context("failed to unset memory.high")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            let cgroup_clone = Arc::clone(&cgroup);
+            spawn_with_cancel(
+                token.clone(),
+                |_| error!("cgroup watcher terminated"),
+                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+            );
+
+            state.cgroup = Some(cgroup);
+        }
+
+        let mut file_cache_reserved_bytes = 0;
+        let mem = get_total_system_memory();

        // We need to process file cache initialization before cgroup initialization, so that the memory
        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
@@ -161,7 +156,7 @@ impl Runner {
                false => FileCacheConfig::default_in_memory(),
            };

-            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+            let mut file_cache = FileCacheState::new(connstr, config, token)
                .await
                .context("failed to create file cache")?;

@@ -186,40 +181,23 @@ impl Runner {
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }
-
-            if args.file_cache_on_disk {
-                file_cache_disk_size = actual_size;
+            // Mark the resources given to the file cache as reserved, but only if it's in memory.
+            if !args.file_cache_on_disk {
+                file_cache_reserved_bytes = actual_size;
            }

            state.filecache = Some(file_cache);
        }

-        if let Some(name) = &args.cgroup {
-            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
-            // now, and then set limits later.
-            info!("initializing cgroup");
+        if let Some(cgroup) = &state.cgroup {
+            let available = mem - file_cache_reserved_bytes;
+            let value = cgroup.config.calculate_memory_high_value(available);

-            let cgroup =
-                CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;
+            info!(value, "setting memory.high");

-            let init_value = cgroup::MemoryHistory {
-                avg_non_reclaimable: 0,
-                samples_count: 0,
-                samples_span: Duration::ZERO,
-            };
-            let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
-
-            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
-                cgroup.watch(hist_tx).await
-            });
-
-            let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
-            info!(threshold, "set initial cgroup threshold",);
-
-            state.cgroup = Some(CgroupState {
-                watcher: hist_rx,
-                threshold,
-            });
+            cgroup
+                .set_memory_high_bytes(value)
+                .context("failed to set cgroup memory.high")?;
        }

        Ok(state)
@@ -239,51 +217,28 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let (expected_file_cache_size, expected_file_cache_disk_size) = self
+        let expected_file_cache_mem_usage = self
            .filecache
            .as_ref()
-            .map(|file_cache| {
-                let size = file_cache.config.calculate_cache_size(usable_system_memory);
-                match file_cache.config.in_memory {
-                    true => (size, 0),
-                    false => (size, size),
-                }
-            })
-            .unwrap_or((0, 0));
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
+        let mut new_cgroup_mem_high = 0;
        if let Some(cgroup) = &self.cgroup {
-            let (last_time, last_history) = *cgroup.watcher.borrow();
-
-            // NB: The ordering of these conditions is intentional. During startup, we should deny
-            // downscaling until we have enough information to determine that it's safe to do so
-            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
-            // received any information, we should *fail* instead of just denying downscaling.
-            //
-            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
-            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
-            // OR if we haven't received metrics *recently enough*.
-            //
-            // TODO: make the duration here configurable.
-            if last_time.elapsed() > Duration::from_secs(5) {
-                bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
-            } else if last_history.samples_count <= 1 {
-                let status = "haven't received enough cgroup memory stats yet";
-                info!(status, "discontinuing downscale");
-                return Ok((false, status.to_owned()));
-            }
-
-            let new_threshold = self
+            new_cgroup_mem_high = cgroup
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
+                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);

-            let current = last_history.avg_non_reclaimable;
+            let current = cgroup
+                .current_memory_usage()
+                .context("failed to fetch cgroup memory")?;

-            if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
+            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
                let status = format!(
-                    "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
-                    "calculated memory threshold too low",
-                    bytes_to_mebibytes(new_threshold),
+                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
+                    "calculated memory.high too low",
+                    bytes_to_mebibytes(new_cgroup_mem_high),
                    bytes_to_mebibytes(current),
-                    bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
+                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
                );

                info!(status, "discontinuing downscale");
@@ -294,14 +249,14 @@ impl Runner {

        // The downscaling has been approved. Downscale the file cache, then the cgroup.
        let mut status = vec![];
-        let mut file_cache_disk_size = 0;
+        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
            let actual_usage = file_cache
-                .set_file_cache_size(expected_file_cache_size)
+                .set_file_cache_size(expected_file_cache_mem_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
            }
            let message = format!(
                "set file cache size to {} MiB (in memory = {})",
@@ -312,18 +267,24 @@ impl Runner {
            status.push(message);
        }

-        if let Some(cgroup) = &mut self.cgroup {
-            let new_threshold = self
-                .config
-                .cgroup_threshold(usable_system_memory, file_cache_disk_size);
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+
+            if file_cache_mem_usage != expected_file_cache_mem_usage {
+                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+            }
+
+            // new_cgroup_mem_high is initialized to 0, but it is guaranteed to have been
+            // reassigned by now, since it is set in the previous `if let Some(cgroup)` block
+            cgroup
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;

            let message = format!(
-                "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
-                bytes_to_mebibytes(cgroup.threshold),
-                bytes_to_mebibytes(new_threshold),
-                bytes_to_mebibytes(usable_system_memory)
+                "set cgroup memory.high to {} MiB, of new max {} MiB",
+                bytes_to_mebibytes(new_cgroup_mem_high),
+                bytes_to_mebibytes(available_memory)
            );
-            cgroup.threshold = new_threshold;
            info!("downscale: {message}");
            status.push(message);
        }
@@ -344,7 +305,8 @@ impl Runner {
        let new_mem = resources.mem;
        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);

-        let mut file_cache_disk_size = 0;
+        // Get the file cache's expected contribution to the memory usage
+        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
@@ -357,8 +319,8 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
+            if file_cache.config.in_memory {
+                file_cache_mem_usage = actual_usage;
            }

            if actual_usage != expected_usage {
@@ -370,18 +332,18 @@ impl Runner {
            }
        }

-        if let Some(cgroup) = &mut self.cgroup {
-            let new_threshold = self
-                .config
-                .cgroup_threshold(usable_system_memory, file_cache_disk_size);
-
+        if let Some(cgroup) = &self.cgroup {
+            let available_memory = usable_system_memory - file_cache_mem_usage;
+            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
            info!(
-                "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
-                bytes_to_mebibytes(cgroup.threshold),
-                bytes_to_mebibytes(new_threshold),
-                bytes_to_mebibytes(usable_system_memory)
+                target = bytes_to_mebibytes(new_cgroup_mem_high),
+                total = bytes_to_mebibytes(new_mem),
+                name = cgroup.path(),
+                "updating cgroup memory.high",
            );
-            cgroup.threshold = new_threshold;
+            cgroup
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
        }

        Ok(())
@@ -399,6 +361,10 @@ impl Runner {
                self.handle_upscale(granted)
                    .await
                    .context("failed to handle upscale")?;
+                self.dispatcher
+                    .notify_upscale(Sequenced::new(granted))
+                    .await
+                    .context("failed to notify notify cgroup of upscale")?;
                Ok(Some(OutboundMsg::new(
                    OutboundMsgKind::UpscaleConfirmation {},
                    id,
@@ -442,53 +408,33 @@ impl Runner {
                        Err(e) => bail!("failed to receive kill signal: {e}")
                    }
                }
-
-                // New memory stats from the cgroup, *may* need to request upscaling, if we've
-                // exceeded the threshold
-                result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
-                    result.context("failed to receive from cgroup memory stats watcher")?;
-
-                    let cgroup = self.cgroup.as_ref().unwrap();
-
-                    let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
-
-                    // If we haven't exceeded the threshold, then we're all ok
-                    if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
-                        continue;
+                // we need to propagate an upscale request
+                request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
+                    if request.is_none() {
+                        bail!("failed to listen for upscale event from cgroup")
                    }

-                    // Otherwise, we generally want upscaling. But, if it's been less than 1 second
-                    // since the last time we requested upscaling, ignore the event, to avoid
-                    // spamming the agent.
+                    // If it's been less than 1 second since the last time we requested upscaling,
+                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
+                    // ~1k times per second).
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            info!(
-                                elapsed_millis = elapsed.as_millis(),
-                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
-                                threshold = bytes_to_mebibytes(cgroup.threshold),
-                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
-                            );
+                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
                            continue;
                        }
                    }

                    self.last_upscale_request_at = Some(Instant::now());

-                    info!(
-                        avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
-                        threshold = bytes_to_mebibytes(cgroup.threshold),
-                        "cgroup memory stats are high enough to upscale, requesting upscale",
-                    );
-
+                    info!("cgroup asking for upscale; forwarding request");
                    self.counter += 2; // Increment, preserving parity (i.e. keep the
                                       // counter odd). See the field comment for more.
                    self.dispatcher
                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
                        .await
                        .context("failed to send message")?;
-                },
-
+                }
                // there is a message from the agent
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
@@ -516,14 +462,11 @@ impl Runner {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
                                    Err(e) => {
-                                        // use {:#} for our logging because the display impl only
-                                        // gives the outermost cause, and the debug impl
-                                        // pretty-prints the error, whereas {:#} contains all the
-                                        // causes, but is compact (no newlines).
-                                        warn!(error = format!("{e:#}"), "error handling message");
+                                        let error = e.to_string();
+                                        warn!(?error, "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
-                                                error: e.to_string(),
+                                                error
                                            },
                                            message.id
                                        )
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -11,7 +11,10 @@ use std::sync::{Arc, Barrier};

 use bytes::{Buf, Bytes};
 use pageserver::{
-    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
+    config::PageServerConf,
+    repository::Key,
+    walrecord::NeonWalRecord,
+    walredo::{PostgresRedoManager, WalRedoError},
 };
 use utils::{id::TenantId, lsn::Lsn};

@@ -149,7 +152,7 @@ impl Drop for JoinOnDrop {
    }
 }

-fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> anyhow::Result<()>
+fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
 where
    I: IntoIterator<Item = Request>,
 {
@@ -157,7 +160,7 @@ where
    input.into_iter().try_for_each(|req| {
        let page = req.execute(manager)?;
        assert_eq!(page.remaining(), 8192);
-        anyhow::Ok(())
+        Ok::<_, WalRedoError>(())
    })
 }

@@ -470,7 +473,7 @@ struct Request {
 }

 impl Request {
-    fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
+    fn execute(self, manager: &PostgresRedoManager) -> Result<Bytes, WalRedoError> {
        use pageserver::walredo::WalRedoManager;

        let Request {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -355,7 +355,6 @@ fn start_pageserver(
    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
    // (background_task_maximum_delay).
-    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
@@ -363,8 +362,7 @@ fn start_pageserver(
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
-        initial_tenant_load_remote: Some(init_done_tx),
-        initial_tenant_load: Some(init_remote_done_tx),
+        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
@@ -390,9 +388,6 @@ fn start_pageserver(
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

-            init_remote_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed");
-
            init_done_rx.wait().await;
            startup_checkpoint("initial_tenant_load", "Initial load completed");
            STARTUP_IS_LOADING.set(0);
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -211,10 +211,6 @@ pub struct PageServerConf {

    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option<SecretString>,
-
-    /// If true, pageserver will make best-effort to operate without a control plane: only
-    /// for use in major incidents.
-    pub control_plane_emergency_mode: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -292,7 +288,6 @@ struct PageServerConfigBuilder {

    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
-    control_plane_emergency_mode: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -360,7 +355,6 @@ impl Default for PageServerConfigBuilder {

            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
-            control_plane_emergency_mode: Set(false),
        }
    }
 }
@@ -497,10 +491,6 @@ impl PageServerConfigBuilder {
        self.control_plane_api_token = BuilderValue::Set(token)
    }

-    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
-        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -592,9 +582,6 @@ impl PageServerConfigBuilder {
            control_plane_api_token: self
                .control_plane_api_token
                .ok_or(anyhow!("missing control_plane_api_token"))?,
-            control_plane_emergency_mode: self
-                .control_plane_emergency_mode
-                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
        })
    }
 }
@@ -820,10 +807,6 @@ impl PageServerConf {
                        builder.control_plane_api_token(Some(parsed.into()))
                    }
                },
-                "control_plane_emergency_mode" => {
-                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-
-                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -993,7 +976,6 @@ impl PageServerConf {
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
            control_plane_api_token: None,
-            control_plane_emergency_mode: false,
        }
    }
 }
@@ -1217,8 +1199,7 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_api_token: None
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1274,8 +1255,7 @@ background_task_maximum_delay = '334 s'
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
-                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_api_token: None
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,7 +2,6 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
@@ -144,7 +143,7 @@ pub async fn collect_metrics(
        crate::tenant::tasks::warn_when_period_overrun(
            tick_at.elapsed(),
            metric_collection_interval,
-            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
+            "consumption_metrics_collect_metrics",
        );
    }
 }
@@ -269,11 +268,6 @@ async fn calculate_synthetic_size_worker(
            }

            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
-                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-                // We can put in some prioritization for consumption metrics.
-                // Same for the loop that fetches computed metrics.
-                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-                // which turns out is really handy to understand the system.
                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
@@ -283,7 +277,7 @@ async fn calculate_synthetic_size_worker(
        crate::tenant::tasks::warn_when_period_overrun(
            tick_at.elapsed(),
            synthetic_size_calculation_interval,
-            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
+            "consumption_metrics_synthetic_size_worker",
        );
    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -40,6 +40,7 @@ use validator::ValidatorQueueMessage;

 use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};

+// TODO: administrative "panic button" config property to disable all deletions
 // TODO: configurable for how long to wait before executing deletions

 /// We aggregate object deletions from many tenants in one place, for several reasons:
@@ -153,7 +154,7 @@ impl FlushOp {

 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
-    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
+    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
@@ -185,7 +186,7 @@ where
    V: Serialize,
    I: AsRef<[u8]>,
 {
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));

    transformed
        .collect::<HashMap<String, &V>>()
@@ -212,7 +213,7 @@ where

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
-const TEMP_SUFFIX: &str = "tmp";
+const TEMP_SUFFIX: &str = ".tmp";

 #[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
@@ -324,7 +325,10 @@ impl DeletionList {
            return false;
        }

-        let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();
+        let timeline_entry = tenant_entry
+            .timelines
+            .entry(*timeline)
+            .or_insert_with(Vec::new);

        let timeline_remote_path = remote_timeline_path(tenant, timeline);

@@ -416,7 +420,7 @@ pub enum DeletionQueueError {
 impl DeletionQueueClient {
    pub(crate) fn broken() -> Self {
        // Channels whose receivers are immediately dropped.
-        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
+        let (tx, _rx) = tokio::sync::mpsc::channel(1);
        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
        Self {
            tx,
@@ -428,12 +432,12 @@ impl DeletionQueueClient {
    /// This is cancel-safe.  If you drop the future before it completes, the message
    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
    /// we decide to do a deletion the decision is always final.
-    fn do_push<T>(
+    async fn do_push<T>(
        &self,
-        queue: &tokio::sync::mpsc::UnboundedSender<T>,
+        queue: &tokio::sync::mpsc::Sender<T>,
        msg: T,
    ) -> Result<(), DeletionQueueError> {
-        match queue.send(msg) {
+        match queue.send(msg).await {
            Ok(_) => Ok(()),
            Err(e) => {
                // This shouldn't happen, we should shut down all tenants before
@@ -445,7 +449,7 @@ impl DeletionQueueClient {
        }
    }

-    pub(crate) fn recover(
+    pub(crate) async fn recover(
        &self,
        attached_tenants: HashMap<TenantId, Generation>,
    ) -> Result<(), DeletionQueueError> {
@@ -453,6 +457,7 @@ impl DeletionQueueClient {
            &self.tx,
            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
        )
+        .await
    }

    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
@@ -525,21 +530,6 @@ impl DeletionQueueClient {
            return self.flush_immediate().await;
        }

-        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
-    }
-
-    /// When a Tenant has a generation, push_layers is always synchronous because
-    /// the ListValidator channel is an unbounded channel.
-    ///
-    /// This can be merged into push_layers when we remove the Generation-less mode
-    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
-    pub(crate) fn push_layers_sync(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        current_generation: Generation,
-        layers: Vec<(LayerFileName, Generation)>,
-    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -553,16 +543,17 @@ impl DeletionQueueClient {
                objects: Vec::new(),
            }),
        )
+        .await
    }

    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
    async fn do_flush<T>(
        &self,
-        queue: &tokio::sync::mpsc::UnboundedSender<T>,
+        queue: &tokio::sync::mpsc::Sender<T>,
        msg: T,
        rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), DeletionQueueError> {
-        self.do_push(queue, msg)?;
+        self.do_push(queue, msg).await?;
        if rx.await.is_err() {
            // This shouldn't happen if tenants are shut down before deletion queue.  If we
            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
@@ -583,18 +574,6 @@ impl DeletionQueueClient {
            .await
    }

-    /// Issue a flush without waiting for it to complete.  This is useful on advisory flushes where
-    /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
-    /// detach where flushing is nice but not necessary.
-    ///
-    /// This function provides no guarantees of work being done.
-    pub fn flush_advisory(&self) {
-        let (flush_op, _) = FlushOp::new();
-
-        // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
-        drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
-    }
-
    // Wait until all previous deletions are executed
    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
        debug!("flush_execute: flushing to deletion lists...");
@@ -611,7 +590,9 @@ impl DeletionQueueClient {
        // Flush any immediate-mode deletions (the above backend flush will only flush
        // the executor if deletions had flowed through the backend)
        debug!("flush_execute: flushing execution...");
-        self.flush_immediate().await?;
+        let (flush_op, rx) = FlushOp::new();
+        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
+            .await?;
        debug!("flush_execute: finished flushing execution...");
        Ok(())
    }
@@ -666,10 +647,8 @@ impl DeletionQueue {
    where
        C: ControlPlaneGenerationsApi + Send + Sync,
    {
-        // Unbounded channel: enables non-async functions to submit deletions.  The actual length is
-        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
-        // enough to avoid this taking pathologically large amount of memory.
-        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+        // Deep channel: it consumes deletions from all timelines and we do not want to block them
+        let (tx, rx) = tokio::sync::mpsc::channel(16384);

        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
@@ -982,7 +961,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new())?;
+        client.recover(HashMap::new()).await?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_id = ctx.harness.tenant_id;
@@ -1050,7 +1029,7 @@ mod test {
    async fn deletion_queue_validation() -> anyhow::Result<()> {
        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new())?;
+        client.recover(HashMap::new()).await?;

        // Generation that the control plane thinks is current
        let latest_generation = Generation::new(0xdeadbeef);
@@ -1107,7 +1086,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new())?;
+        client.recover(HashMap::new()).await?;

        let tenant_id = ctx.harness.tenant_id;

@@ -1170,7 +1149,9 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::from([(tenant_id, now_generation)]))?;
+        client
+            .recover(HashMap::from([(tenant_id, now_generation)]))
+            .await?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1196,7 +1177,7 @@ pub(crate) mod mock {
    };

    pub struct ConsumerState {
-        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
    }

@@ -1273,7 +1254,7 @@ pub(crate) mod mock {
    }

    pub struct MockDeletionQueue {
-        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
        executed: Arc<AtomicUsize>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -1283,7 +1264,7 @@ pub(crate) mod mock {

    impl MockDeletionQueue {
        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
-            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+            let (tx, rx) = tokio::sync::mpsc::channel(16384);
            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

            let executed = Arc::new(AtomicUsize::new(0));
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -13,7 +13,6 @@ use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
-use utils::backoff;

 use crate::metrics;

@@ -64,19 +63,7 @@ impl Deleter {
            Err(anyhow::anyhow!("failpoint hit"))
        });

-        // A backoff::retry is used here for two reasons:
-        // - To provide a backoff rather than busy-polling the API on errors
-        // - To absorb transient 429/503 conditions without hitting our error
-        //   logging path for issues deleting objects.
-        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
-            |_| false,
-            3,
-            10,
-            "executing deletion batch",
-            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
-        )
-        .await
+        self.remote_storage.delete_objects(&self.accumulator).await
    }

    /// Block until everything in accumulator has been executed
@@ -101,10 +88,7 @@ impl Deleter {
                    self.accumulator.clear();
                }
                Err(e) => {
-                    if self.cancel.is_cancelled() {
-                        return Err(DeletionQueueError::ShuttingDown);
-                    }
-                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
+                    warn!("DeleteObjects request failed: {e:#}, will retry");
                    metrics::DELETION_QUEUE
                        .remote_errors
                        .with_label_values(&["execute"])
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -85,7 +85,7 @@ pub(super) struct ListWriter {
    conf: &'static PageServerConf,

    // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
+    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,

    // Outbound requests to the backend to execute deletion lists we have composed.
    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
@@ -111,7 +111,7 @@ impl ListWriter {

    pub(super) fn new(
        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
        cancel: CancellationToken,
    ) -> Self {
@@ -230,7 +230,6 @@ impl ListWriter {
        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();

-        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await? {
@@ -242,7 +241,7 @@ impl ListWriter {
                continue;
            }

-            if dentry_str.ends_with(&temp_extension) {
+            if dentry_str.ends_with(TEMP_SUFFIX) {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -411,11 +411,6 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
-                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                                warn!(%layer, "failed to evict layer: {detail}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
                            None => {
                                assert!(cancel.is_cancelled());
                                return;
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,16 +93,9 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
    delete:
      description: |
-        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
+        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
        "400":
@@ -141,13 +134,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -192,13 +178,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
@@ -247,13 +226,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
    delete:
      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
@@ -293,19 +265,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
+
        "500":
          description: Generic operation error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
@@ -362,13 +328,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -416,13 +375,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/attach:
    parameters:
      - name: tenant_id
@@ -513,13 +465,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/detach:
    parameters:
@@ -573,13 +518,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/ignore:
    parameters:
@@ -622,13 +560,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/load:
    parameters:
@@ -673,13 +604,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
@@ -717,12 +641,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/size:
    parameters:
@@ -786,13 +704,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -869,13 +780,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -906,13 +810,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
    post:
      description: |
        Create a tenant. Returns new tenant id on success.
@@ -963,13 +860,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/config:
    put:
@@ -1015,13 +905,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/config/:
    parameters:
      - name: tenant_id
@@ -1071,13 +954,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
 components:
  securitySchemes:
    JWT:
@@ -1344,13 +1220,6 @@ components:
      properties:
        msg:
          type: string
-    ServiceUnavailableError:
-      type: object
-      required:
-        - msg
-      properties:
-        msg:
-          type: string
    NotFoundError:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -6,7 +6,6 @@ use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
-use hyper::header::CONTENT_TYPE;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -77,7 +76,7 @@ impl State {
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
+        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
@@ -134,9 +133,11 @@ impl From<PageReconstructError> for ApiError {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
            PageReconstructError::AncestorStopping(_) => {
-                ApiError::ResourceUnavailable(format!("{pre}").into())
+                ApiError::ResourceUnavailable(format!("{pre}"))
+            }
+            PageReconstructError::WalRedo(pre) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
            }
-            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
 }
@@ -145,7 +146,7 @@ impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{tmie}").into())
+                ApiError::ResourceUnavailable(format!("{tmie}"))
            }
            TenantMapInsertError::TenantAlreadyExists(id, state) => {
                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
@@ -162,6 +163,9 @@ impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            TenantStateError::NotActive(_) => {
+                ApiError::ResourceUnavailable("Tenant not yet active".into())
+            }
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
@@ -391,9 +395,6 @@ async fn timeline_create_handler(
                    format!("{err:#}")
                ))
            }
-            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
-            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -570,14 +571,9 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(
-        conf,
-        tenant_id,
-        detach_ignored.unwrap_or(false),
-        &state.deletion_queue_client,
-    )
-    .instrument(info_span!("tenant_detach", %tenant_id))
-    .await?;
+    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
+        .instrument(info_span!("tenant_detach", %tenant_id))
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -640,7 +636,7 @@ async fn tenant_list_handler(
        .instrument(info_span!("tenant_list"))
        .await
        .map_err(|_| {
-            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
+            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
        })?
        .iter()
        .map(|(id, state)| TenantInfo {
@@ -1034,7 +1030,7 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
+        mgr::detach_tenant(conf, tenant_id, true)
            .instrument(info_span!("tenant_detach", %tenant_id))
            .await?;
        return json_response(StatusCode::OK, ());
@@ -1240,136 +1236,6 @@ async fn deletion_queue_flush(
    }
 }

-/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
-async fn getpage_at_lsn_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    struct Key(crate::repository::Key);
-
-    impl std::str::FromStr for Key {
-        type Err = anyhow::Error;
-
-        fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
-            crate::repository::Key::from_hex(s).map(Key)
-        }
-    }
-
-    let key: Key = parse_query_param(&request, "key")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
-    let lsn: Lsn = parse_query_param(&request, "lsn")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
-
-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-
-        let page = timeline.get(key.0, lsn, &ctx).await?;
-
-        Result::<_, ApiError>::Ok(
-            Response::builder()
-                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, "application/octet-stream")
-                .body(hyper::Body::from(page))
-                .unwrap(),
-        )
-    }
-    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
-    .await
-}
-
-async fn timeline_collect_keyspace(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    struct Partitioning {
-        keys: crate::keyspace::KeySpace,
-
-        at_lsn: Lsn,
-    }
-
-    impl serde::Serialize for Partitioning {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeMap;
-            let mut map = serializer.serialize_map(Some(2))?;
-            map.serialize_key("keys")?;
-            map.serialize_value(&KeySpace(&self.keys))?;
-            map.serialize_key("at_lsn")?;
-            map.serialize_value(&WithDisplay(&self.at_lsn))?;
-            map.end()
-        }
-    }
-
-    struct WithDisplay<'a, T>(&'a T);
-
-    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            serializer.collect_str(&self.0)
-        }
-    }
-
-    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
-
-    impl<'a> serde::Serialize for KeySpace<'a> {
-        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeSeq;
-            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
-            for kr in &self.0.ranges {
-                seq.serialize_element(&KeyRange(kr))?;
-            }
-            seq.end()
-        }
-    }
-
-    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
-
-    impl<'a> serde::Serialize for KeyRange<'a> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            use serde::ser::SerializeTuple;
-            let mut t = serializer.serialize_tuple(2)?;
-            t.serialize_element(&WithDisplay(&self.0.start))?;
-            t.serialize_element(&WithDisplay(&self.0.end))?;
-            t.end()
-        }
-    }
-
-    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
-
-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let keys = timeline
-            .collect_keyspace(at_lsn, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
-    }
-    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
-    .await
-}
-
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1717,12 +1583,5 @@ pub fn make_router(
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
-        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
-            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
-        })
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
-            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -173,9 +173,6 @@ fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
 /// delaying is needed.
 #[derive(Clone)]
 pub struct InitializationOrder {
-    /// Each initial tenant load task carries this until it is done loading timelines from remote storage
-    pub initial_tenant_load_remote: Option<utils::completion::Completion>,
-
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -691,9 +691,10 @@ impl StorageIoTime {
        .expect("failed to define a metric");
        let metrics = std::array::from_fn(|i| {
            let op = StorageIoOperation::from_repr(i).unwrap();
-            storage_io_histogram_vec
+            let metric = storage_io_histogram_vec
                .get_metric_with_label_values(&[op.as_str()])
-                .unwrap()
+                .unwrap();
+            metric
        });
        Self { metrics }
    }
@@ -1067,26 +1068,6 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            &["task"],
-        )
-        .unwrap()
-    });
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap()
-    });
-
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -66,7 +66,8 @@
 //! inserted to the mapping, but you must hold the write-lock on the slot until
 //! the contents are valid. If you need to release the lock without initializing
 //! the contents, you must remove the mapping first. We make that easy for the
-//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
+//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
+//! page, the caller must explicitly call guard.mark_valid() after it has
 //! initialized it. If the guard is dropped without calling mark_valid(), the
 //! mapping is automatically removed and the slot is marked free.
 //!
@@ -285,25 +286,23 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 ///
 /// Counterintuitively, this is used even for a read, if the requested page is not
 /// currently found in the page cache. In that case, the caller of lock_for_read()
-/// is expected to fill in the page contents and call mark_valid().
+/// is expected to fill in the page contents and call mark_valid(). Similarly
+/// lock_for_write() can return an invalid buffer that the caller is expected
+/// to initialize.
+///
 pub struct PageWriteGuard<'i> {
-    state: PageWriteGuardState<'i>,
-}
+    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,

-enum PageWriteGuardState<'i> {
-    Invalid {
-        inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
-        _permit: PinnedSlotsPermit,
-    },
-    Downgraded,
+    _permit: PinnedSlotsPermit,
+
+    // Are the page contents currently valid?
+    // Used to mark pages as invalid that are assigned but not yet filled with data.
+    valid: bool,
 }

 impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

@@ -311,37 +310,25 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        match &self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

 impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+        self.inner.buf
    }
 }

-impl<'a> PageWriteGuard<'a> {
+impl PageWriteGuard<'_> {
    /// Mark that the buffer contents are now valid.
-    #[must_use]
-    pub fn mark_valid(mut self) -> PageReadGuard<'a> {
-        let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
-        match prev {
-            PageWriteGuardState::Invalid { inner, _permit } => {
-                assert!(inner.key.is_some());
-                PageReadGuard {
-                    _permit: Arc::new(_permit),
-                    slot_guard: inner.downgrade(),
-                }
-            }
-            PageWriteGuardState::Downgraded => unreachable!(),
-        }
+    pub fn mark_valid(&mut self) {
+        assert!(self.inner.key.is_some());
+        assert!(
+            !self.valid,
+            "mark_valid called on a buffer that was already valid"
+        );
+        self.valid = true;
    }
 }

@@ -352,14 +339,11 @@ impl Drop for PageWriteGuard<'_> {
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
-        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => {
-                assert!(inner.key.is_some());
-                let self_key = inner.key.as_ref().unwrap();
-                PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-                inner.key = None;
-            }
-            PageWriteGuardState::Downgraded => {}
+        assert!(self.inner.key.is_some());
+        if !self.valid {
+            let self_key = self.inner.key.as_ref().unwrap();
+            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+            self.inner.key = None;
        }
    }
 }
@@ -370,6 +354,12 @@ pub enum ReadBufResult<'a> {
    NotFound(PageWriteGuard<'a>),
 }

+/// lock_for_write() return value
+pub enum WriteBufResult<'a> {
+    Found(PageWriteGuard<'a>),
+    NotFound(PageWriteGuard<'a>),
+}
+
 impl PageCache {
    //
    // Section 1.1: Public interface functions for looking up and memorizing materialized page
@@ -456,77 +446,20 @@ impl PageCache {
            lsn,
        };

-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
-                // The page was found in the mapping. Lock the slot, and re-check
-                // that it's still what we expected (because we don't released the mapping
-                // lock already, another thread could have evicted the page)
-                let slot = &self.slots[slot_idx];
-                let inner = slot.inner.write().await;
-                if inner.key.as_ref() == Some(&cache_key) {
-                    slot.inc_usage_count();
-                    debug_assert!(
-                        {
-                            let guard = inner.permit.lock().unwrap();
-                            guard.upgrade().is_none()
-                        },
-                        "we hold a write lock, so, no one else should have a permit"
-                    );
-                    debug_assert_eq!(inner.buf.len(), img.len());
-                    // We already had it in cache. Another thread must've put it there
-                    // concurrently. Check that it had the same contents that we
-                    // replayed.
-                    assert!(inner.buf == img);
-                    return Ok(());
-                }
+        match self.lock_for_write(&cache_key).await? {
+            WriteBufResult::Found(write_guard) => {
+                // We already had it in cache. Another thread must've put it there
+                // concurrently. Check that it had the same contents that we
+                // replayed.
+                assert!(*write_guard == img);
            }
-            debug_assert!(permit.is_some());
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
+            WriteBufResult::NotFound(mut write_guard) => {
+                write_guard.copy_from_slice(img);
+                write_guard.mark_valid();
            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
-            // Create a write guard for the slot so we go through the expected motions.
-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-            let mut write_guard = PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
-                    _permit: permit.take().unwrap(),
-                    inner,
-                },
-            };
-            write_guard.copy_from_slice(img);
-            let _ = write_guard.mark_valid();
-            return Ok(());
        }
+
+        Ok(())
    }

    // Section 1.2: Public interface functions for working with immutable file pages.
@@ -705,10 +638,99 @@ impl PageCache {
            );

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                state: PageWriteGuardState::Invalid {
+                _permit: permit.take().unwrap(),
+                inner,
+                valid: false,
+            }));
+        }
+    }
+
+    /// Look up a page in the cache and lock it in write mode. If it's not
+    /// found, returns None.
+    ///
+    /// When locking a page for writing, the search criteria is always "exact".
+    async fn try_lock_for_write(
+        &self,
+        cache_key: &CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageWriteGuard> {
+        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
+            // The page was found in the mapping. Lock the slot, and re-check
+            // that it's still what we expected (because we no longer hold the mapping
+            // lock, another thread could have evicted the page)
+            let slot = &self.slots[slot_idx];
+            let inner = slot.inner.write().await;
+            if inner.key.as_ref() == Some(cache_key) {
+                slot.inc_usage_count();
+                debug_assert!(
+                    {
+                        let guard = inner.permit.lock().unwrap();
+                        guard.upgrade().is_none()
+                    },
+                    "we hold a write lock, so, no one else should have a permit"
+                );
+                return Some(PageWriteGuard {
                    _permit: permit.take().unwrap(),
                    inner,
+                    valid: true,
+                });
+            }
+        }
+        None
+    }
+
+    /// Return a write-locked buffer for given block.
+    ///
+    /// Similar to lock_for_read(), but the returned buffer is write-locked and
+    /// may be modified by the caller even if it's already found in the cache.
+    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
+                return Ok(WriteBufResult::Found(write_guard));
+            }
+            debug_assert!(permit.is_some());
+
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
+            return Ok(WriteBufResult::NotFound(PageWriteGuard {
+                _permit: permit.take().unwrap(),
+                inner,
+                valid: false,
            }));
        }
    }
@@ -753,7 +775,7 @@ impl PageCache {
    ///
    /// Like 'search_mapping, but performs an "exact" search. Used for
    /// allocating a new buffer.
-    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
+    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
        match key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
                let map = self.materialized_page_map.read().unwrap();
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -35,7 +35,6 @@ use std::time::Duration;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
-use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -65,6 +64,69 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

+fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
+where
+    IO: AsyncRead + AsyncWrite + Unpin,
+{
+    async_stream::try_stream! {
+        loop {
+            let msg = tokio::select! {
+                biased;
+
+                _ = task_mgr::shutdown_watcher() => {
+                    // We were requested to shut down.
+                    let msg = "pageserver is shutting down";
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                    Err(QueryError::Other(anyhow::anyhow!(msg)))
+                }
+
+                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+            };
+
+            match msg {
+                Ok(Some(message)) => {
+                    let copy_data_bytes = match message {
+                        FeMessage::CopyData(bytes) => bytes,
+                        FeMessage::CopyDone => { break },
+                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = "client terminated connection with Terminate message during COPY";
+                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
+                        m => {
+                            let msg = format!("unexpected message {m:?}");
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                            Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                            break;
+                        }
+                    };
+
+                    yield copy_data_bytes;
+                }
+                Ok(None) => {
+                    let msg = "client closed connection during COPY";
+                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    // error can't happen here, ErrorResponse serialization should be always ok
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                    pgb.flush().await?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                }
+                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                    Err(io_error)?;
+                }
+                Err(other) => {
+                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                }
+            };
+        }
+    }
+}
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -222,13 +284,7 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(
-        conf,
-        broker_client,
-        auth,
-        connection_ctx,
-        task_mgr::shutdown_token(),
-    );
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -262,10 +318,6 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
-
-    /// A token that should fire when the tenant transitions from
-    /// attached state, or when the pageserver is shutting down.
-    cancel: CancellationToken,
 }

 impl PageServerHandler {
@@ -274,7 +326,6 @@ impl PageServerHandler {
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
-        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -282,91 +333,6 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
-            cancel,
-        }
-    }
-
-    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
-    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
-    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
-    /// in the flush.
-    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        tokio::select!(
-            flush_r = pgb.flush() => {
-                Ok(flush_r?)
-            },
-            _ = self.cancel.cancelled() => {
-                Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
-            }
-        )
-    }
-
-    fn copyin_stream<'a, IO>(
-        &'a self,
-        pgb: &'a mut PostgresBackend<IO>,
-    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        async_stream::try_stream! {
-            loop {
-                let msg = tokio::select! {
-                    biased;
-
-                    _ = task_mgr::shutdown_watcher() => {
-                        // We were requested to shut down.
-                        let msg = "pageserver is shutting down";
-                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                        Err(QueryError::Other(anyhow::anyhow!(msg)))
-                    }
-
-                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-                };
-
-                match msg {
-                    Ok(Some(message)) => {
-                        let copy_data_bytes = match message {
-                            FeMessage::CopyData(bytes) => bytes,
-                            FeMessage::CopyDone => { break },
-                            FeMessage::Sync => continue,
-                            FeMessage::Terminate => {
-                                let msg = "client terminated connection with Terminate message during COPY";
-                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                                break;
-                            }
-                            m => {
-                                let msg = format!("unexpected message {m:?}");
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                                break;
-                            }
-                        };
-
-                        yield copy_data_bytes;
-                    }
-                    Ok(None) => {
-                        let msg = "client closed connection during COPY";
-                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                        // error can't happen here, ErrorResponse serialization should be always ok
-                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                    }
-                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                        Err(io_error)?;
-                    }
-                    Err(other) => {
-                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                    }
-                };
-            }
        }
    }

@@ -406,7 +372,7 @@ impl PageServerHandler {

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -499,7 +465,7 @@ impl PageServerHandler {
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb).await?;
+            pgb.flush().await?;
        }
        Ok(())
    }
@@ -542,9 +508,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
+        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -597,8 +563,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
+        pgb.flush().await?;
+        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -806,7 +772,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -858,7 +824,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        self.flush_cancellable(pgb).await?;
+        pgb.flush().await?;

        let basebackup_after = started
            .elapsed()
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -23,14 +23,12 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;
-use utils::completion::Completion;
 use utils::crashsafe::path_with_suffix_extension;

 use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
-use std::collections::HashSet;
 use std::fmt::Debug;
 use std::fmt::Display;
 use std::fs;
@@ -47,7 +45,6 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::AttachedLocationConfig;
-use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
@@ -187,11 +184,6 @@ impl AttachedTenantConf {
        }
    }
 }
-struct TimelinePreload {
-    timeline_id: TimelineId,
-    client: RemoteTimelineClient,
-    index_part: Result<MaybeDeletedIndexPart, DownloadError>,
-}

 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
@@ -216,7 +208,7 @@ pub struct Tenant {

    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
-    ///
+    ///  
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    generation: Generation,
@@ -414,8 +406,6 @@ pub enum CreateTimelineError {
    AlreadyExists,
    #[error(transparent)]
    AncestorLsn(anyhow::Error),
-    #[error("ancestor timeline is not active")]
-    AncestorNotActive,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -969,9 +959,6 @@ impl Tenant {
                let _completion = init_order
                    .as_mut()
                    .and_then(|x| x.initial_tenant_load.take());
-                let remote_load_completion = init_order
-                    .as_mut()
-                    .and_then(|x| x.initial_tenant_load_remote.take());

                // Dont block pageserver startup on figuring out deletion status
                let pending_deletion = {
@@ -996,7 +983,6 @@ impl Tenant {
                    // as we are no longer loading, signal completion by dropping
                    // the completion while we resume deletion
                    drop(_completion);
-                    drop(remote_load_completion);
                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
                    let _ = init_order
                        .as_mut()
@@ -1022,10 +1008,7 @@ impl Tenant {
                let background_jobs_can_start =
                    init_order.as_ref().map(|x| &x.background_jobs_can_start);

-                match tenant_clone
-                    .load(init_order.as_ref(), remote_load_completion, &ctx)
-                    .await
-                {
+                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                    Ok(()) => {
                        debug!("load finished");

@@ -1189,52 +1172,6 @@ impl Tenant {
        })
    }

-    async fn load_timeline_metadata(
-        self: &Arc<Tenant>,
-        timeline_ids: HashSet<TimelineId>,
-        remote_storage: &GenericRemoteStorage,
-    ) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
-        let mut part_downloads = JoinSet::new();
-        for timeline_id in timeline_ids {
-            let client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
-                self.conf,
-                self.tenant_id,
-                timeline_id,
-                self.generation,
-            );
-            part_downloads.spawn(
-                async move {
-                    debug!("starting index part download");
-
-                    let index_part = client.download_index_file().await;
-
-                    debug!("finished index part download");
-
-                    Result::<_, anyhow::Error>::Ok(TimelinePreload {
-                        client,
-                        timeline_id,
-                        index_part,
-                    })
-                }
-                .map(move |res| {
-                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
-                })
-                .instrument(info_span!("download_index_part", %timeline_id)),
-            );
-        }
-
-        let mut timeline_preloads: HashMap<TimelineId, TimelinePreload> = HashMap::new();
-        while let Some(result) = part_downloads.join_next().await {
-            let preload_result = result.context("join preload task")?;
-            let preload = preload_result?;
-            timeline_preloads.insert(preload.timeline_id, preload);
-        }
-
-        Ok(timeline_preloads)
-    }
-
    ///
    /// Background task to load in-memory data structures for this tenant, from
    /// files on disk. Used at pageserver startup.
@@ -1243,13 +1180,14 @@ impl Tenant {
    async fn load(
        self: &Arc<Tenant>,
        init_order: Option<&InitializationOrder>,
-        remote_completion: Option<Completion>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        debug!("loading tenant task");

+        crate::failpoint_support::sleep_millis_async!("before-loading-tenant");
+
        // Load in-memory state to reflect the local files on disk
        //
        // Scan the directory, peek into the metadata file of each timeline, and
@@ -1268,38 +1206,10 @@ impl Tenant {
        // FIXME original collect_timeline_files contained one more check:
        //    1. "Timeline has no ancestor and no layer files"

-        // Load remote content for timelines in this tenant
-        let all_timeline_ids = scan
-            .sorted_timelines_to_load
-            .iter()
-            .map(|i| i.0)
-            .chain(scan.timelines_to_resume_deletion.iter().map(|i| i.0))
-            .collect();
-        let mut preload = if let Some(remote_storage) = &self.remote_storage {
-            Some(
-                self.load_timeline_metadata(all_timeline_ids, remote_storage)
-                    .await?,
-            )
-        } else {
-            None
-        };
-
-        drop(remote_completion);
-
-        crate::failpoint_support::sleep_millis_async!("before-loading-tenant");
-
        // Process loadable timelines first
        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
-            let timeline_preload = preload.as_mut().map(|p| p.remove(&timeline_id).unwrap());
            if let Err(e) = self
-                .load_local_timeline(
-                    timeline_id,
-                    local_metadata,
-                    timeline_preload,
-                    init_order,
-                    ctx,
-                    false,
-                )
+                .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false)
                .await
            {
                match e {
@@ -1332,17 +1242,8 @@ impl Tenant {
                    }
                }
                Some(local_metadata) => {
-                    let timeline_preload =
-                        preload.as_mut().map(|p| p.remove(&timeline_id).unwrap());
                    if let Err(e) = self
-                        .load_local_timeline(
-                            timeline_id,
-                            local_metadata,
-                            timeline_preload,
-                            init_order,
-                            ctx,
-                            true,
-                        )
+                        .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
                        .await
                    {
                        match e {
@@ -1370,12 +1271,11 @@ impl Tenant {
    /// Subroutine of `load_tenant`, to load an individual timeline
    ///
    /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, init_order, preload, ctx))]
+    #[instrument(skip(self, local_metadata, init_order, ctx))]
    async fn load_local_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
        local_metadata: TimelineMetadata,
-        preload: Option<TimelinePreload>,
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
        found_delete_mark: bool,
@@ -1384,81 +1284,74 @@ impl Tenant {

        let mut resources = self.build_timeline_resources(timeline_id);

-        let (remote_startup_data, remote_client) = match preload {
-            Some(preload) => {
-                let TimelinePreload {
-                    index_part,
-                    client: remote_client,
-                    timeline_id: _timeline_id,
-                } = preload;
-                match index_part {
-                    Ok(index_part) => {
-                        let index_part = match index_part {
-                            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-                            MaybeDeletedIndexPart::Deleted(index_part) => {
-                                // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
-                                // Example:
-                                //  start deletion operation
-                                //  finishes upload of index part
-                                //  pageserver crashes
-                                //  remote storage gets de-configured
-                                //  pageserver starts
-                                //
-                                // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
-                                // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
-                                info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler");
+        let (remote_startup_data, remote_client) = match resources.remote_client {
+            Some(remote_client) => match remote_client.download_index_file().await {
+                Ok(index_part) => {
+                    let index_part = match index_part {
+                        MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+                        MaybeDeletedIndexPart::Deleted(index_part) => {
+                            // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
+                            // Example:
+                            //  start deletion operation
+                            //  finishes upload of index part
+                            //  pageserver crashes
+                            //  remote storage gets de-configured
+                            //  pageserver starts
+                            //
+                            // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
+                            // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
+                            info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler");

-                                remote_client
-                                    .init_upload_queue_stopped_to_continue_deletion(&index_part)
-                                    .context("init queue stopped")
-                                    .map_err(LoadLocalTimelineError::ResumeDeletion)?;
-
-                                DeleteTimelineFlow::resume_deletion(
-                                    Arc::clone(self),
-                                    timeline_id,
-                                    &local_metadata,
-                                    Some(remote_client),
-                                    self.deletion_queue_client.clone(),
-                                    init_order,
-                                )
-                                .await
-                                .context("resume deletion")
+                            remote_client
+                                .init_upload_queue_stopped_to_continue_deletion(&index_part)
+                                .context("init queue stopped")
                                .map_err(LoadLocalTimelineError::ResumeDeletion)?;

-                                return Ok(());
-                            }
-                        };
-
-                        let remote_metadata = index_part.metadata.clone();
-                        (
-                            Some(RemoteStartupData {
-                                index_part,
-                                remote_metadata,
-                            }),
-                            Some(remote_client),
-                        )
-                    }
-                    Err(DownloadError::NotFound) => {
-                        info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
-
-                        if found_delete_mark {
-                            // We could've resumed at a point where remote index was deleted, but metadata file wasnt.
-                            // Cleanup:
-                            return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(
-                                self,
+                            DeleteTimelineFlow::resume_deletion(
+                                Arc::clone(self),
                                timeline_id,
+                                &local_metadata,
+                                Some(remote_client),
+                                self.deletion_queue_client.clone(),
+                                init_order,
                            )
                            .await
-                            .context("cleanup_remaining_timeline_fs_traces")
-                            .map_err(LoadLocalTimelineError::ResumeDeletion);
-                        }
+                            .context("resume deletion")
+                            .map_err(LoadLocalTimelineError::ResumeDeletion)?;

-                        // We're loading fresh timeline that didnt yet make it into remote.
-                        (None, Some(remote_client))
-                    }
-                    Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
+                            return Ok(());
+                        }
+                    };
+
+                    let remote_metadata = index_part.metadata.clone();
+                    (
+                        Some(RemoteStartupData {
+                            index_part,
+                            remote_metadata,
+                        }),
+                        Some(remote_client),
+                    )
                }
-            }
+                Err(DownloadError::NotFound) => {
+                    info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
+
+                    if found_delete_mark {
+                        // We could've resumed at a point where remote index was deleted, but metadata file wasn't.
+                        // Cleanup:
+                        return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(
+                            self,
+                            timeline_id,
+                        )
+                        .await
+                        .context("cleanup_remaining_timeline_fs_traces")
+                        .map_err(LoadLocalTimelineError::ResumeDeletion);
+                    }
+
+                    // We're loading a fresh timeline that didn't yet make it into remote.
+                    (None, Some(remote_client))
+                }
+                Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
+            },
            None => {
                // No remote client
                if found_delete_mark {
@@ -1694,12 +1587,6 @@ impl Tenant {
                    .get_timeline(ancestor_timeline_id, false)
                    .context("Cannot branch off the timeline that's not present in pageserver")?;

-                // instead of waiting around, just deny the request because ancestor is not yet
-                // ready for other purposes either.
-                if !ancestor_timeline.is_active() {
-                    return Err(CreateTimelineError::AncestorNotActive);
-                }
-
                if let Some(lsn) = ancestor_start_lsn.as_mut() {
                    *lsn = lsn.align();

@@ -1732,6 +1619,8 @@ impl Tenant {
            }
        };

+        loaded_timeline.activate(broker_client, None, ctx);
+
        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
            // Ok, the timeline is durable in remote storage.
@@ -1743,8 +1632,6 @@ impl Tenant {
            })?;
        }

-        loaded_timeline.activate(broker_client, None, ctx);
-
        Ok(loaded_timeline)
    }

@@ -2181,15 +2068,6 @@ impl Tenant {
            }
        }
    }
-
-    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf
-            .read()
-            .unwrap()
-            .location
-            .attach_mode
-            .clone()
-    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2859,11 +2737,6 @@ impl Tenant {
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;

-        // First acquire the GC lock so that another task cannot advance the GC
-        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
-        // creating the branch.
-        let _gc_cs = self.gc_cs.lock().await;
-
        // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
        let start_lsn = start_lsn.unwrap_or_else(|| {
            let lsn = src_timeline.get_last_record_lsn();
@@ -2871,6 +2744,11 @@ impl Tenant {
            lsn
        });

+        // First acquire the GC lock so that another task cannot advance the GC
+        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
+        // creating the branch.
+        let _gc_cs = self.gc_cs.lock().await;
+
        // Create a placeholder for the new branch. This will error
        // out if the new timeline ID is already in use.
        let timeline_uninit_mark = {
@@ -3552,8 +3430,11 @@ pub mod harness {

    use crate::deletion_queue::mock::MockDeletionQueue;
    use crate::{
-        config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
-        walredo::WalRedoManager,
+        config::PageServerConf,
+        repository::Key,
+        tenant::Tenant,
+        walrecord::NeonWalRecord,
+        walredo::{WalRedoError, WalRedoManager},
    };

    use super::*;
@@ -3698,7 +3579,7 @@ pub mod harness {
                self.deletion_queue.new_client(),
            ));
            tenant
-                .load(None, None, ctx)
+                .load(None, ctx)
                .instrument(info_span!("try_load", tenant_id=%self.tenant_id))
                .await?;

@@ -3726,7 +3607,7 @@ pub mod harness {
            base_img: Option<(Lsn, Bytes)>,
            records: Vec<(Lsn, NeonWalRecord)>,
            _pg_version: u32,
-        ) -> anyhow::Result<Bytes> {
+        ) -> Result<Bytes, WalRedoError> {
            let s = format!(
                "redo for {} to get to {}, with {} and {} records",
                key,
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -186,21 +186,26 @@ impl FileBlockReader {
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
-        match cache
-            .read_immutable_buf(self.file_id, blknum, ctx)
-            .await
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to read immutable buf: {e:#}"),
-                )
-            })? {
-            ReadBufResult::Found(guard) => Ok(guard.into()),
-            ReadBufResult::NotFound(mut write_guard) => {
-                // Read the page from disk into the buffer
-                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
-                Ok(write_guard.mark_valid().into())
-            }
+        loop {
+            match cache
+                .read_immutable_buf(self.file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        format!("Failed to read immutable buf: {e:#}"),
+                    )
+                })? {
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
+                ReadBufResult::NotFound(mut write_guard) => {
+                    // Read the page from disk into the buffer
+                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                    write_guard.mark_valid();
+
+                    // Swap for read lock
+                    continue;
+                }
+            };
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -31,7 +31,7 @@ use super::{
 const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum DeleteTenantError {
+pub enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

@@ -376,7 +376,7 @@ impl DeleteTenantFlow {
        Ok(())
    }

-    pub(crate) async fn should_resume_deletion(
+    pub async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
@@ -432,7 +432,7 @@ impl DeleteTenantFlow {
        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
        if timelines_path.exists() {
-            tenant.load(init_order, None, ctx).await.context("load")?;
+            tenant.load(init_order, ctx).await.context("load")?;
        }

        Self::background(
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -72,32 +72,36 @@ impl EphemeralFile {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
-            match cache
-                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                .await
-                .map_err(|e| {
-                    std::io::Error::new(
-                        std::io::ErrorKind::Other,
-                        // order path before error because error is anyhow::Error => might have many contexts
-                        format!(
-                            "ephemeral file: read immutable page #{}: {}: {:#}",
-                            blknum, self.file.path, e,
-                        ),
-                    )
-                })? {
-                page_cache::ReadBufResult::Found(guard) => {
-                    return Ok(BlockLease::PageReadGuard(guard))
-                }
-                page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                    let buf: &mut [u8] = write_guard.deref_mut();
-                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                    self.file
-                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                        .await?;
-                    let read_guard = write_guard.mark_valid();
-                    return Ok(BlockLease::PageReadGuard(read_guard));
-                }
-            };
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                    .await
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum, self.file.path, e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                            .await?;
+                        write_guard.mark_valid();
+
+                        // Swap for read lock
+                        continue;
+                    }
+                };
+            }
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -167,7 +171,7 @@ impl EphemeralFile {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                        let _ = write_guard.mark_valid();
+                                        write_guard.mark_valid();
                                        // pre-warm successful
                                    }
                                    Err(e) => {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -24,7 +24,7 @@ use crate::control_plane_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::{LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{
    create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
@@ -50,7 +50,7 @@ use super::TenantSharedResources;
 /// its lifetime, and we can preserve some important safety invariants like `Tenant` always
 /// having a properly acquired generation (Secondary doesn't need a generation)
 #[derive(Clone)]
-pub(crate) enum TenantSlot {
+pub enum TenantSlot {
    Attached(Arc<Tenant>),
    Secondary,
 }
@@ -151,147 +151,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U

 static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));

-fn emergency_generations(
-    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantId, Generation> {
-    tenant_confs
-        .iter()
-        .filter_map(|(tid, lc)| {
-            let lc = match lc {
-                Ok(lc) => lc,
-                Err(_) => return None,
-            };
-            let gen = match &lc.mode {
-                LocationMode::Attached(alc) => Some(alc.generation),
-                LocationMode::Secondary(_) => None,
-            };
-
-            gen.map(|g| (*tid, g))
-        })
-        .collect()
-}
-
-async fn init_load_generations(
-    conf: &'static PageServerConf,
-    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
-    resources: &TenantSharedResources,
-    cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
-    let generations = if conf.control_plane_emergency_mode {
-        error!(
-            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
-        );
-        emergency_generations(tenant_confs)
-    } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
-        info!("Calling control plane API to re-attach tenants");
-        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-        match client.re_attach().await {
-            Ok(tenants) => tenants,
-            Err(RetryForeverError::ShuttingDown) => {
-                anyhow::bail!("Shut down while waiting for control plane re-attach response")
-            }
-        }
-    } else {
-        info!("Control plane API not configured, tenant generations are disabled");
-        return Ok(None);
-    };
-
-    // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
-    // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
-    // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
-    // are processed, even though we don't block on recovery completing here.
-    //
-    // Must only do this if remote storage is enabled, otherwise deletion queue
-    // is not running and channel push will fail.
-    if resources.remote_storage.is_some() {
-        resources
-            .deletion_queue_client
-            .recover(generations.clone())?;
-    }
-
-    Ok(Some(generations))
-}
-
-/// Initial stage of load: walk the local tenants directory, clean up any temp files,
-/// and load configurations for the tenants we found.
-async fn init_load_tenant_configs(
-    conf: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
-    let tenants_dir = conf.tenants_path();
-
-    let mut dir_entries = tenants_dir
-        .read_dir_utf8()
-        .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
-
-    let mut configs = HashMap::new();
-
-    loop {
-        match dir_entries.next() {
-            None => break,
-            Some(Ok(dentry)) => {
-                let tenant_dir_path = dentry.path().to_path_buf();
-                if crate::is_temporary(&tenant_dir_path) {
-                    info!("Found temporary tenant directory, removing: {tenant_dir_path}");
-                    // No need to use safe_remove_tenant_dir_all because this is already
-                    // a temporary path
-                    if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
-                        error!(
-                            "Failed to remove temporary directory '{}': {:?}",
-                            tenant_dir_path, e
-                        );
-                    }
-                    continue;
-                }
-
-                // This case happens if we:
-                // * crash during attach before creating the attach marker file
-                // * crash during tenant delete before removing tenant directory
-                let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
-                    format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
-                })?;
-                if is_empty {
-                    info!("removing empty tenant directory {tenant_dir_path:?}");
-                    if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
-                        error!(
-                            "Failed to remove empty tenant directory '{}': {e:#}",
-                            tenant_dir_path
-                        )
-                    }
-                    continue;
-                }
-
-                let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-                if tenant_ignore_mark_file.exists() {
-                    info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-                    continue;
-                }
-
-                let tenant_id = match tenant_dir_path
-                    .file_name()
-                    .unwrap_or_default()
-                    .parse::<TenantId>()
-                {
-                    Ok(id) => id,
-                    Err(_) => {
-                        warn!(
-                            "Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
-                        );
-                        continue;
-                    }
-                };
-
-                configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
-            }
-            Some(Err(e)) => {
-                // An error listing the top level directory indicates serious problem
-                // with local filesystem: we will fail to load, and fail to start.
-                anyhow::bail!(e);
-            }
-        }
-    }
-    Ok(configs)
-}
-
 /// Initialize repositories with locally available timelines.
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the tenant once download is completed.
@@ -302,96 +161,196 @@ pub async fn init_tenant_mgr(
    init_order: InitializationOrder,
    cancel: CancellationToken,
 ) -> anyhow::Result<()> {
+    // Scan local filesystem for attached tenants
+    let tenants_dir = conf.tenants_path();
+
    let mut tenants = HashMap::new();

+    // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
+    let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
+        let result = match client.re_attach().await {
+            Ok(tenants) => tenants,
+            Err(RetryForeverError::ShuttingDown) => {
+                anyhow::bail!("Shut down while waiting for control plane re-attach response")
+            }
+        };
+
+        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
+        // the queue. Sequential processing of the queue ensures that recovery is done before any new tenant deletions
+        // are processed, even though we don't block on recovery completing here.
+        //
+        // Must only do this if remote storage is enabled, otherwise deletion queue
+        // is not running and channel push will fail.
+        if resources.remote_storage.is_some() {
+            resources
+                .deletion_queue_client
+                .recover(result.clone())
+                .await?;
+        }
+
+        Some(result)
+    } else {
+        info!("Control plane API not configured, tenant generations are disabled");
+        None
+    };
+
+    let mut dir_entries = tenants_dir
+        .read_dir_utf8()
+        .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+
    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);

-    // Scan local filesystem for attached tenants
-    let tenant_configs = init_load_tenant_configs(conf).await?;
+    loop {
+        match dir_entries.next() {
+            None => break,
+            Some(Ok(dir_entry)) => {
+                let tenant_dir_path = dir_entry.path().to_path_buf();
+                if crate::is_temporary(&tenant_dir_path) {
+                    info!("Found temporary tenant directory, removing: {tenant_dir_path}");
+                    // No need to use safe_remove_tenant_dir_all because this is already
+                    // a temporary path
+                    if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
+                        error!(
+                            "Failed to remove temporary directory '{}': {:?}",
+                            tenant_dir_path, e
+                        );
+                    }
+                } else {
+                    // This case happens if we:
+                    // * crash during attach before creating the attach marker file
+                    // * crash during tenant delete before removing tenant directory
+                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
+                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
+                    })?;
+                    if is_empty {
+                        info!("removing empty tenant directory {tenant_dir_path:?}");
+                        if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
+                            error!(
+                                "Failed to remove empty tenant directory '{}': {e:#}",
+                                tenant_dir_path
+                            )
+                        }
+                        continue;
+                    }

-    // Determine which tenants are to be attached
-    let tenant_generations =
-        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+                    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+                    if tenant_ignore_mark_file.exists() {
+                        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+                        continue;
+                    }

-    // Construct `Tenant` objects and start them running
-    for (tenant_id, location_conf) in tenant_configs {
-        let tenant_dir_path = conf.tenant_path(&tenant_id);
+                    let tenant_id = match tenant_dir_path
+                        .file_name()
+                        .unwrap_or_default()
+                        .parse::<TenantId>()
+                    {
+                        Ok(id) => id,
+                        Err(_) => {
+                            warn!(
+                                "Invalid tenant path (garbage in our repo directory?): {}",
+                                tenant_dir_path
+                            );
+                            continue;
+                        }
+                    };

-        let mut location_conf = match location_conf {
-            Ok(l) => l,
-            Err(e) => {
-                warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
+                    // Try loading the location configuration
+                    let mut location_conf = match Tenant::load_tenant_config(conf, &tenant_id)
+                        .context("load tenant config")
+                    {
+                        Ok(c) => c,
+                        Err(e) => {
+                            warn!("Marking tenant broken, failed to {e:#}");

-                tenants.insert(
-                    tenant_id,
-                    TenantSlot::Attached(Tenant::create_broken_tenant(
+                            tenants.insert(
+                                tenant_id,
+                                TenantSlot::Attached(Tenant::create_broken_tenant(
+                                    conf,
+                                    tenant_id,
+                                    "error loading tenant location configuration".to_string(),
+                                )),
+                            );
+
+                            continue;
+                        }
+                    };
+
+                    let generation = if let Some(generations) = &tenant_generations {
+                        // We have a generation map: treat it as the authority for whether
+                        // this tenant is really attached.
+                        if let Some(gen) = generations.get(&tenant_id) {
+                            *gen
+                        } else {
+                            match &location_conf.mode {
+                                LocationMode::Secondary(_) => {
+                                    // We do not require the control plane's permission for secondary mode
+                                    // tenants, because they do no remote writes and hence require no
+                                    // generation number
+                                    info!("Loaded tenant {tenant_id} in secondary mode");
+                                    tenants.insert(tenant_id, TenantSlot::Secondary);
+                                }
+                                LocationMode::Attached(_) => {
+                                    // TODO: augment re-attach API to enable the control plane to
+                                    // instruct us about secondary attachments.  That way, instead of throwing
+                                    // away local state, we can gracefully fall back to secondary here, if the control
+                                    // plane tells us so.
+                                    // (https://github.com/neondatabase/neon/issues/5377)
+                                    info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
+                                    if let Err(e) =
+                                        safe_remove_tenant_dir_all(&tenant_dir_path).await
+                                    {
+                                        error!(
+                                            "Failed to remove detached tenant directory '{}': {:?}",
+                                            tenant_dir_path, e
+                                        );
+                                    }
+                                }
+                            };
+
+                            continue;
+                        }
+                    } else {
+                        // Legacy mode: no generation information, any tenant present
+                        // on local disk may activate
+                        info!(
+                            "Starting tenant {} in legacy mode, no generation",
+                            tenant_dir_path
+                        );
+                        Generation::none()
+                    };
+
+                    // Presence of a generation number implies attachment: attach the tenant
+                    // if it wasn't already, and apply the generation number.
+                    location_conf.attach_in_generation(generation);
+                    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+
+                    match schedule_local_tenant_processing(
                        conf,
                        tenant_id,
-                        format!("{}", e),
-                    )),
-                );
-                continue;
-            }
-        };
-
-        let generation = if let Some(generations) = &tenant_generations {
-            // We have a generation map: treat it as the authority for whether
-            // this tenant is really attached.
-            if let Some(gen) = generations.get(&tenant_id) {
-                *gen
-            } else {
-                match &location_conf.mode {
-                    LocationMode::Secondary(_) => {
-                        // We do not require the control plane's permission for secondary mode
-                        // tenants, because they do no remote writes and hence require no
-                        // generation number
-                        info!(%tenant_id, "Loaded tenant in secondary mode");
-                        tenants.insert(tenant_id, TenantSlot::Secondary);
-                    }
-                    LocationMode::Attached(_) => {
-                        // TODO: augment re-attach API to enable the control plane to
-                        // instruct us about secondary attachments.  That way, instead of throwing
-                        // away local state, we can gracefully fall back to secondary here, if the control
-                        // plane tells us so.
-                        // (https://github.com/neondatabase/neon/issues/5377)
-                        info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
-                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                            error!(%tenant_id,
-                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
-                            );
+                        &tenant_dir_path,
+                        AttachedTenantConf::try_from(location_conf)?,
+                        resources.clone(),
+                        Some(init_order.clone()),
+                        &TENANTS,
+                        &ctx,
+                    ) {
+                        Ok(tenant) => {
+                            tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
+                        }
+                        Err(e) => {
+                            error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
                        }
                    }
-                };
-
-                continue;
+                }
            }
-        } else {
-            // Legacy mode: no generation information, any tenant present
-            // on local disk may activate
-            info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
-            Generation::none()
-        };
-
-        // Presence of a generation number implies attachment: attach the tenant
-        // if it wasn't already, and apply the generation number.
-        location_conf.attach_in_generation(generation);
-        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
-
-        match schedule_local_tenant_processing(
-            conf,
-            tenant_id,
-            &tenant_dir_path,
-            AttachedTenantConf::try_from(location_conf)?,
-            resources.clone(),
-            Some(init_order.clone()),
-            &TENANTS,
-            &ctx,
-        ) {
-            Ok(tenant) => {
-                tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
-            }
-            Err(e) => {
-                error!(%tenant_id, "Failed to start tenant: {e:#}");
+            Some(Err(e)) => {
+                // On error, print it, but continue with the other tenants. If we error out
+                // here, the pageserver startup fails altogether, causing outage for *all*
+                // tenants. That seems worse.
+                error!(
+                    "Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
+                );
            }
        }
    }
@@ -481,7 +440,7 @@ pub(crate) fn schedule_local_tenant_processing(
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
 #[instrument(skip_all)]
-pub(crate) async fn shutdown_all_tenants() {
+pub async fn shutdown_all_tenants() {
    shutdown_all_tenants0(&TENANTS).await
 }

@@ -593,7 +552,7 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
    // caller will log how long we took
 }

-pub(crate) async fn create_tenant(
+pub async fn create_tenant(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
@@ -628,14 +587,14 @@ pub(crate) async fn create_tenant(
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum SetNewTenantConfigError {
+pub enum SetNewTenantConfigError {
    #[error(transparent)]
    GetTenant(#[from] GetTenantError),
    #[error(transparent)]
    Persist(anyhow::Error),
 }

-pub(crate) async fn set_new_tenant_config(
+pub async fn set_new_tenant_config(
    conf: &'static PageServerConf,
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
@@ -694,18 +653,6 @@ pub(crate) async fn upsert_location(

    if let Some(tenant) = shutdown_tenant {
        let (_guard, progress) = utils::completion::channel();
-
-        match tenant.get_attach_mode() {
-            AttachmentMode::Single | AttachmentMode::Multi => {
-                // Before we leave our state as the presumed holder of the latest generation,
-                // flush any outstanding deletions to reduce the risk of leaking objects.
-                deletion_queue_client.flush_advisory()
-            }
-            AttachmentMode::Stale => {
-                // If we're stale there's not point trying to flush deletions
-            }
-        };
-
        info!("Shutting down attached tenant");
        match tenant.shutdown(progress, false).await {
            Ok(()) => {}
@@ -776,7 +723,7 @@ pub(crate) async fn upsert_location(
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum GetTenantError {
+pub enum GetTenantError {
    #[error("Tenant {0} not found")]
    NotFound(TenantId),
    #[error("Tenant {0} is not active")]
@@ -792,7 +739,7 @@ pub(crate) enum GetTenantError {
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
 ///
 /// This method is cancel-safe.
-pub(crate) async fn get_tenant(
+pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
@@ -817,7 +764,7 @@ pub(crate) async fn get_tenant(
    }
 }

-pub(crate) async fn delete_tenant(
+pub async fn delete_tenant(
    conf: &'static PageServerConf,
    remote_storage: Option<GenericRemoteStorage>,
    tenant_id: TenantId,
@@ -826,7 +773,7 @@ pub(crate) async fn delete_tenant(
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum DeleteTimelineError {
+pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
    Tenant(#[from] GetTenantError),

@@ -834,7 +781,7 @@ pub(crate) enum DeleteTimelineError {
    Timeline(#[from] crate::tenant::DeleteTimelineError),
 }

-pub(crate) async fn delete_timeline(
+pub async fn delete_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    _ctx: &RequestContext,
@@ -845,29 +792,23 @@ pub(crate) async fn delete_timeline(
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum TenantStateError {
+pub enum TenantStateError {
    #[error("Tenant {0} not found")]
    NotFound(TenantId),
    #[error("Tenant {0} is stopping")]
    IsStopping(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

-pub(crate) async fn detach_tenant(
+pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(
-        conf,
-        &TENANTS,
-        tenant_id,
-        detach_ignored,
-        deletion_queue_client,
-    )
-    .await?;
+    let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
    let task_tenant_id = None;
@@ -892,7 +833,6 @@ async fn detach_tenant0(
    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    detach_ignored: bool,
-    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
    let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -904,10 +844,6 @@ async fn detach_tenant0(
    let removal_result =
        remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;

-    // Flush pending deletions, so that they have a good chance of passing validation
-    // before this tenant is potentially re-attached elsewhere.
-    deletion_queue_client.flush_advisory();
-
    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
    if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
@@ -924,7 +860,7 @@ async fn detach_tenant0(
    removal_result
 }

-pub(crate) async fn load_tenant(
+pub async fn load_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    generation: Generation,
@@ -961,7 +897,7 @@ pub(crate) async fn load_tenant(
    Ok(())
 }

-pub(crate) async fn ignore_tenant(
+pub async fn ignore_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
@@ -989,7 +925,7 @@ async fn ignore_tenant0(
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum TenantMapListError {
+pub enum TenantMapListError {
    #[error("tenant map is still initiailizing")]
    Initializing,
 }
@@ -997,7 +933,7 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
+pub async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
    let tenants = TENANTS.read().await;
    let m = match &*tenants {
        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1015,7 +951,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, Tenan
 ///
 /// Downloading all the tenant data is performed in the background, this merely
 /// spawns the background task and returns quickly.
-pub(crate) async fn attach_tenant(
+pub async fn attach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    generation: Generation,
@@ -1052,7 +988,7 @@ pub(crate) async fn attach_tenant(
 }

 #[derive(Debug, thiserror::Error)]
-pub(crate) enum TenantMapInsertError {
+pub enum TenantMapInsertError {
    #[error("tenant map is still initializing")]
    StillInitializing,
    #[error("tenant map is shutting down")]
@@ -1215,7 +1151,7 @@ use {
    utils::http::error::ApiError,
 };

-pub(crate) async fn immediate_gc(
+pub async fn immediate_gc(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    gc_req: TimelineGcRequest,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -901,27 +901,9 @@ impl RemoteTimelineClient {
        .await
        .context("list prefixes")?;

-        // We will delete the current index_part object last, since it acts as a deletion
-        // marker via its deleted_at attribute
-        let latest_index = remaining
-            .iter()
-            .filter(|p| {
-                p.object_name()
-                    .map(|n| n.starts_with(IndexPart::FILE_NAME))
-                    .unwrap_or(false)
-            })
-            .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen)))
-            .max_by_key(|i| i.1)
-            .map(|i| i.0.clone())
-            .unwrap_or(
-                // No generation-suffixed indices, assume we are dealing with
-                // a legacy index.
-                remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
-            );
-
-        let remaining_layers: Vec<RemotePath> = remaining
+        let remaining: Vec<RemotePath> = remaining
            .into_iter()
-            .filter(|p| p!= &latest_index)
+            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
            .inspect(|path| {
                if let Some(name) = path.object_name() {
                    info!(%name, "deleting a file not referenced from index_part.json");
@@ -931,11 +913,9 @@ impl RemoteTimelineClient {
            })
            .collect();

-        let not_referenced_count = remaining_layers.len();
-        if !remaining_layers.is_empty() {
-            self.deletion_queue_client
-                .push_immediate(remaining_layers)
-                .await?;
+        let not_referenced_count = remaining.len();
+        if !remaining.is_empty() {
+            self.deletion_queue_client.push_immediate(remaining).await?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -944,9 +924,11 @@ impl RemoteTimelineClient {
            ))?
        });

+        let index_file_path = timeline_storage_path.join(Utf8Path::new(IndexPart::FILE_NAME));
+
        debug!("enqueuing index part deletion");
        self.deletion_queue_client
-            .push_immediate([latest_index].to_vec())
+            .push_immediate([index_file_path].to_vec())
            .await?;

        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
@@ -1419,13 +1401,6 @@ impl RemoteTimelineClient {
            }
        }
    }
-
-    pub(crate) fn get_layer_metadata(
-        &self,
-        name: &LayerFileName,
-    ) -> anyhow::Result<Option<LayerFileMetadata>> {
-        self.upload_queue.lock().unwrap().get_layer_metadata(name)
-    }
 }

 pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -31,7 +31,6 @@ pub(super) async fn upload_index_part<'a>(
    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });
-    pausable_failpoint!("before-upload-index-pausable");

    let index_part_bytes =
        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -511,7 +511,8 @@ impl DeltaLayer {
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<Self> {
-        let mut summary_buf = vec![0; PAGE_SZ];
+        let mut summary_buf = Vec::new();
+        summary_buf.resize(PAGE_SZ, 0);
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -400,7 +400,8 @@ impl ImageLayer {
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
-        let mut summary_buf = vec![0; PAGE_SZ];
+        let mut summary_buf = Vec::new();
+        summary_buf.resize(PAGE_SZ, 0);
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;
        let metadata = file
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -14,73 +14,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;

-static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-    once_cell::sync::Lazy::new(|| {
-        let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
-        let permits = usize::max(
-            1,
-            // while a lot of the work is done on spawn_blocking, we still do
-            // repartitioning in the async context. this should give leave us some workers
-            // unblocked to be blocked on other work, hopefully easing any outside visible
-            // effects of restarts.
-            //
-            // 6/8 is a guess; previously we ran with unlimited 8 and more from
-            // spawn_blocking.
-            (total_threads * 3).checked_div(4).unwrap_or(0),
-        );
-        assert_ne!(permits, 0, "we will not be adding in permits later");
-        assert!(
-            permits < total_threads,
-            "need threads avail for shorter work"
-        );
-        tokio::sync::Semaphore::new(permits)
-    });
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
-#[strum(serialize_all = "snake_case")]
-pub(crate) enum BackgroundLoopKind {
-    Compaction,
-    Gc,
-    Eviction,
-    ConsumptionMetricsCollectMetrics,
-    ConsumptionMetricsSyntheticSizeWorker,
-}
-
-impl BackgroundLoopKind {
-    fn as_static_str(&self) -> &'static str {
-        let s: &'static str = self.into();
-        s
-    }
-}
-
-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
-    loop_kind: BackgroundLoopKind,
-    _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
-    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
-        .with_label_values(&[loop_kind.as_static_str()])
-        .inc();
-    scopeguard::defer!(
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
-    );
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
-    }
-}
-
 /// Start per tenant background loops: compaction and gc.
 pub fn start_background_loops(
    tenant: &Arc<Tenant>,
@@ -183,7 +116,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            warn_when_period_overrun(started_at.elapsed(), period, "compaction");

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -251,7 +184,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            warn_when_period_overrun(started_at.elapsed(), period, "gc");

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -325,11 +258,7 @@ pub(crate) async fn random_init_delay(
 }

 /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
-pub(crate) fn warn_when_period_overrun(
-    elapsed: Duration,
-    period: Duration,
-    task: BackgroundLoopKind,
-) {
+pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
@@ -338,11 +267,11 @@ pub(crate) fn warn_when_period_overrun(
        warn!(
            ?elapsed,
            period = %humantime::format_duration(period),
-            ?task,
+            task,
            "task iteration took longer than the configured period"
        );
        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -44,7 +44,6 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
 };
-use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -159,7 +158,7 @@ pub struct Timeline {

    /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
    /// Never changes for the lifetime of this [`Timeline`] object.
-    ///
+    ///  
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    generation: Generation,
@@ -371,7 +370,7 @@ pub enum PageReconstructError {

    /// An error happened replaying WAL records
    #[error(transparent)]
-    WalRedo(anyhow::Error),
+    WalRedo(#[from] crate::walredo::WalRedoError),
 }

 impl std::fmt::Debug for PageReconstructError {
@@ -685,17 +684,37 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

+        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+            once_cell::sync::Lazy::new(|| {
+                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+                let permits = usize::max(
+                    1,
+                    // while a lot of the work is done on spawn_blocking, we still do
+                    // repartitioning in the async context. this should leave us some workers
+                    // unblocked to be blocked on other work, hopefully easing any outside visible
+                    // effects of restarts.
+                    //
+                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
+                    // spawn_blocking.
+                    (total_threads * 3).checked_div(4).unwrap_or(0),
+                );
+                assert_ne!(permits, 0, "we will not be adding in permits later");
+                assert!(
+                    permits < total_threads,
+                    "need threads avail for shorter work"
+                );
+                tokio::sync::Semaphore::new(permits)
+            });
+
        // this wait probably never needs any "long time spent" logging, because we already nag if
        // compaction task goes over it's period (20s) which is quite often in production.
-        let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Compaction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return Ok(()),
+        let _permit = tokio::select! {
+            permit = CONCURRENT_COMPACTIONS.acquire() => {
+                permit
+            },
+            _ = cancel.cancelled() => {
+                return Ok(());
+            }
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1275,23 +1294,7 @@ impl Timeline {
                Ok(delta) => Some(delta),
            };

-        // RemoteTimelineClient holds the metadata on layers' remote generations, so
-        // query it to construct a RemoteLayer.
-        let layer_metadata = self
-            .remote_client
-            .as_ref()
-            .expect("Eviction is not called without remote storage")
-            .get_layer_metadata(&local_layer.filename())
-            .map_err(EvictionError::LayerNotFound)?
-            .ok_or_else(|| {
-                EvictionError::LayerNotFound(anyhow::anyhow!("Layer not in remote metadata"))
-            })?;
-        if layer_metadata.file_size() != layer_file_size {
-            return Err(EvictionError::MetadataInconsistency(format!(
-                "Layer size {layer_file_size} doesn't match remote metadata file size {}",
-                layer_metadata.file_size()
-            )));
-        }
+        let layer_metadata = LayerFileMetadata::new(layer_file_size, self.generation);

        let new_remote_layer = Arc::new(match local_layer.filename() {
            LayerFileName::Image(image_name) => RemoteLayer::new_img(
@@ -1370,10 +1373,6 @@ pub(crate) enum EvictionError {
    /// different objects in memory.
    #[error("layer was no longer part of LayerMap")]
    LayerNotFound(#[source] anyhow::Error),
-
-    /// This should never happen
-    #[error("Metadata inconsistency")]
-    MetadataInconsistency(String),
 }

 /// Number of times we will compute partition within a checkpoint distance.
@@ -2364,7 +2363,7 @@ impl Timeline {
                // during branch creation.
                match ancestor.wait_to_become_active(ctx).await {
                    Ok(()) => {}
-                    Err(TimelineState::Stopping) => {
+                    Err(state) if state == TimelineState::Stopping => {
                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
                    }
                    Err(state) => {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,6 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -130,11 +129,7 @@ impl Timeline {
                    ControlFlow::Continue(()) => (),
                }
                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(
-                    elapsed,
-                    p.period,
-                    BackgroundLoopKind::Eviction,
-                );
+                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
                crate::metrics::EVICTION_ITERATION_DURATION
                    .get_metric_with_label_values(&[
                        &format!("{}", p.period.as_secs()),
@@ -155,17 +150,6 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Eviction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
-        };
-
        // If we evict layers but keep cached values derived from those layers, then
        // we face a storm of on-demand downloads after pageserver restart.
        // The reason is that the restart empties the caches, and so, the values
@@ -301,10 +285,6 @@ impl Timeline {
                    warn!(layer = %l, "failed to evict layer: {e}");
                    stats.not_evictable += 1;
                }
-                Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                    warn!(layer = %l, "failed to evict layer: {detail}");
-                    stats.not_evictable += 1;
-                }
            }
        }
        if stats.candidates == stats.not_evictable {
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -203,18 +203,6 @@ impl UploadQueue {
            UploadQueue::Stopped(stopped) => Ok(stopped),
        }
    }
-
-    pub(crate) fn get_layer_metadata(
-        &self,
-        name: &LayerFileName,
-    ) -> anyhow::Result<Option<LayerFileMetadata>> {
-        match self {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()),
-        }
-    }
 }

 /// An in-progress upload or delete task.
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -18,13 +18,11 @@
 //! any WAL records, so that even if an attacker hijacks the Postgres
 //! process, he cannot escape out of it.
 //!
-use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
 use serde::Serialize;
 use std::collections::VecDeque;
-use std::io;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
 use std::ops::{Deref, DerefMut};
@@ -35,13 +33,14 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
 use std::sync::{Mutex, MutexGuard};
 use std::time::Duration;
 use std::time::Instant;
+use std::{fs, io};
 use tracing::*;
+use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};

-use crate::config::PageServerConf;
 use crate::metrics::{
    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
    WAL_REDO_WAIT_TIME,
@@ -50,6 +49,7 @@ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
 use crate::task_mgr::BACKGROUND_RUNTIME;
 use crate::walrecord::NeonWalRecord;
+use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -66,7 +66,7 @@ use postgres_ffi::BLCKSZ;
 /// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
 ///
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
-pub(crate) struct BufferTag {
+pub struct BufferTag {
    pub rel: RelTag,
    pub blknum: u32,
 }
@@ -89,7 +89,7 @@ pub trait WalRedoManager: Send + Sync {
        base_img: Option<(Lsn, Bytes)>,
        records: Vec<(Lsn, NeonWalRecord)>,
        pg_version: u32,
-    ) -> anyhow::Result<Bytes>;
+    ) -> Result<Bytes, WalRedoError>;
 }

 struct ProcessInput {
@@ -140,6 +140,20 @@ fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
    }
 }

+/// An error happened in WAL redo
+#[derive(Debug, thiserror::Error)]
+pub enum WalRedoError {
+    #[error(transparent)]
+    IoError(#[from] std::io::Error),
+
+    #[error("cannot perform WAL redo now")]
+    InvalidState,
+    #[error("cannot perform WAL redo for this request")]
+    InvalidRequest,
+    #[error("cannot perform WAL redo for this record")]
+    InvalidRecord,
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -157,9 +171,10 @@ impl WalRedoManager for PostgresRedoManager {
        base_img: Option<(Lsn, Bytes)>,
        records: Vec<(Lsn, NeonWalRecord)>,
        pg_version: u32,
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, WalRedoError> {
        if records.is_empty() {
-            anyhow::bail!("invalid WAL redo request with no records");
+            error!("invalid WAL redo request with no records");
+            return Err(WalRedoError::InvalidRequest);
        }

        let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
@@ -223,6 +238,15 @@ impl PostgresRedoManager {
        }
    }

+    /// Launch process pre-emptively. Should not be needed except for benchmarking.
+    pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> {
+        let mut proc = self.stdin.lock().unwrap();
+        if proc.is_none() {
+            self.launch(&mut proc, pg_version)?;
+        }
+        Ok(())
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -236,8 +260,8 @@ impl PostgresRedoManager {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
        pg_version: u32,
-    ) -> anyhow::Result<Bytes> {
-        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+    ) -> Result<Bytes, WalRedoError> {
+        let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let start_time = Instant::now();
        let mut n_attempts = 0u32;
@@ -247,8 +271,7 @@ impl PostgresRedoManager {

            // launch the WAL redo process on first use
            if proc.is_none() {
-                self.launch(&mut proc, pg_version)
-                    .context("launch process")?;
+                self.launch(&mut proc, pg_version)?;
            }
            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());

@@ -256,7 +279,7 @@ impl PostgresRedoManager {
            let buf_tag = BufferTag { rel, blknum };
            let result = self
                .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
-                .context("apply_wal_records");
+                .map_err(WalRedoError::IoError);

            let end_time = Instant::now();
            let duration = end_time.duration_since(lock_time);
@@ -286,15 +309,15 @@ impl PostgresRedoManager {
            // next request will launch a new one.
            if let Err(e) = result.as_ref() {
                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                    n_attempts,
+                    "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}: {}",
                    records.len(),
                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
                    nbytes,
                    base_img_lsn,
                    lsn,
-                    n_attempts,
-                    e,
+                    utils::error::report_compact_sources(e),
                );
                // self.stdin only holds stdin & stderr as_raw_fd().
                // Dropping it as part of take() doesn't close them.
@@ -331,7 +354,7 @@ impl PostgresRedoManager {
        lsn: Lsn,
        base_img: Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, WalRedoError> {
        let start_time = Instant::now();

        let mut page = BytesMut::new();
@@ -340,7 +363,8 @@ impl PostgresRedoManager {
            page.extend_from_slice(&fpi[..]);
        } else {
            // All the current WAL record types that we can handle require a base image.
-            anyhow::bail!("invalid neon WAL redo request with no base image");
+            error!("invalid neon WAL redo request with no base image");
+            return Err(WalRedoError::InvalidRequest);
        }

        // Apply all the WAL records in the batch
@@ -368,13 +392,14 @@ impl PostgresRedoManager {
        page: &mut BytesMut,
        _record_lsn: Lsn,
        record: &NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalRedoError> {
        match record {
            NeonWalRecord::Postgres {
                will_init: _,
                rec: _,
            } => {
-                anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
+                error!("tried to pass postgres wal record to neon WAL redo");
+                return Err(WalRedoError::InvalidRequest);
            }
            NeonWalRecord::ClearVisibilityMapFlags {
                new_heap_blkno,
@@ -382,7 +407,7 @@ impl PostgresRedoManager {
                flags,
            } => {
                // sanity check that this is modifying the correct relation
-                let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+                let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
                assert!(
                    rel.forknum == VISIBILITYMAP_FORKNUM,
                    "ClearVisibilityMapFlags record on unexpected rel {}",
@@ -420,7 +445,7 @@ impl PostgresRedoManager {
            // same effects as the corresponding Postgres WAL redo function.
            NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
+                    key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
                assert_eq!(
                    slru_kind,
                    SlruKind::Clog,
@@ -470,7 +495,7 @@ impl PostgresRedoManager {
            }
            NeonWalRecord::ClogSetAborted { xids } => {
                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
+                    key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
                assert_eq!(
                    slru_kind,
                    SlruKind::Clog,
@@ -501,7 +526,7 @@ impl PostgresRedoManager {
            }
            NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
+                    key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
                assert_eq!(
                    slru_kind,
                    SlruKind::MultiXactOffsets,
@@ -534,7 +559,7 @@ impl PostgresRedoManager {
            }
            NeonWalRecord::MultixactMembersCreate { moff, members } => {
                let (slru_kind, segno, blknum) =
-                    key_to_slru_block(key).context("invalid record")?;
+                    key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
                assert_eq!(
                    slru_kind,
                    SlruKind::MultiXactMembers,
@@ -624,6 +649,26 @@ impl PostgresRedoManager {
        input: &mut MutexGuard<Option<ProcessInput>>,
        pg_version: u32,
    ) -> Result<(), Error> {
+        // Previous versions of wal-redo required a data directory, and those directories
+        // occupied some space on disk. Remove the legacy directory if we find one.
+        //
+        // This code could be dropped after one release cycle.
+        let legacy_datadir = path_with_suffix_extension(
+            self.conf
+                .tenant_path(&self.tenant_id)
+                .join("wal-redo-datadir"),
+            TEMP_FILE_SUFFIX,
+        );
+        if legacy_datadir.exists() {
+            info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing");
+            fs::remove_dir_all(&legacy_datadir).map_err(|e| {
+                Error::new(
+                    e.kind(),
+                    format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"),
+                )
+            })?;
+        }
+
        let pg_bin_dir_path = self
            .conf
            .pg_bin_dir(pg_version)
@@ -714,7 +759,7 @@ impl PostgresRedoManager {
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, std::io::Error> {
        // Serialize all the messages to send the WAL redo process first.
        //
        // This could be problematic if there are millions of records to replay,
@@ -737,7 +782,10 @@ impl PostgresRedoManager {
            {
                build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+                return Err(Error::new(
+                    ErrorKind::Other,
+                    "tried to pass neon wal record to postgres WAL redo",
+                ));
            }
        }
        build_get_page_msg(tag, &mut writebuf);
@@ -759,7 +807,7 @@ impl PostgresRedoManager {
        writebuf: &[u8],
        mut input: MutexGuard<Option<ProcessInput>>,
        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, std::io::Error> {
        let proc = input.as_mut().unwrap();
        let mut nwrite = 0usize;
        let stdout_fd = proc.stdout_fd;
@@ -777,13 +825,13 @@ impl PostgresRedoManager {
        while nwrite < writebuf.len() {
            let n = loop {
                match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
+                    Err(e) if e == nix::errno::Errno::EINTR => continue,
                    res => break res,
                }
            }?;

            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
+                return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
            }

            // If we have some messages in stderr, forward them to the log.
@@ -807,7 +855,10 @@ impl PostgresRedoManager {
                    continue;
                }
            } else if err_revents.contains(PollFlags::POLLHUP) {
-                anyhow::bail!("WAL redo process closed its stderr unexpectedly");
+                return Err(Error::new(
+                    ErrorKind::BrokenPipe,
+                    "WAL redo process closed its stderr unexpectedly",
+                ));
            }

            // If 'stdin' is writeable, do write.
@@ -816,7 +867,10 @@ impl PostgresRedoManager {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
            } else if in_revents.contains(PollFlags::POLLHUP) {
                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
+                return Err(Error::new(
+                    ErrorKind::BrokenPipe,
+                    "WAL redo process closed its stdin unexpectedly",
+                ));
            }
        }
        let request_no = proc.n_requests;
@@ -847,7 +901,10 @@ impl PostgresRedoManager {
            //
            // Cross-read this with the comment in apply_batch_postgres if result.is_err().
            // That's where we kill the child process.
-            anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+            return Err(Error::new(
+                ErrorKind::BrokenPipe,
+                "WAL redo process closed its stdout unexpectedly",
+            ));
        }
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
@@ -860,13 +917,13 @@ impl PostgresRedoManager {
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
                    match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
-                        Err(nix::errno::Errno::EINTR) => continue,
+                        Err(e) if e == nix::errno::Errno::EINTR => continue,
                        res => break res,
                    }
                }?;

                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
+                    return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
                }

                // If we have some messages in stderr, forward them to the log.
@@ -890,7 +947,10 @@ impl PostgresRedoManager {
                        continue;
                    }
                } else if err_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stderr unexpectedly");
+                    return Err(Error::new(
+                        ErrorKind::BrokenPipe,
+                        "WAL redo process closed its stderr unexpectedly",
+                    ));
                }

                // If we have some data in stdout, read it to the result buffer.
@@ -898,7 +958,10 @@ impl PostgresRedoManager {
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
                } else if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+                    return Err(Error::new(
+                        ErrorKind::BrokenPipe,
+                        "WAL redo process closed its stdout unexpectedly",
+                    ));
                }
            }
            output
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,6 +741,13 @@ NeonProcessUtility(
 			break;
 		case T_DropdbStmt:
 			HandleDropDb(castNode(DropdbStmt, parseTree));
+			/*
+			 * We do this here to hack around the fact that Postgres performs the drop
+			 * INSIDE of standard_ProcessUtility, which means that if we try to
+			 * abort the drop normally it'll be too late. DROP DATABASE can't be inside
+			 * of a transaction block anyway, so this should be fine to do.
+			 */
+			NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
 			break;
 		case T_CreateRoleStmt:
 			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -721,7 +721,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls

 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
-  Retry:
+	
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

 	if (entry != NULL)
@@ -858,11 +858,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
-		{
-			/* Prefetch set is reset in case of error, so we should try to register our request once again */
-			goto Retry;
-		}
+		page_server->flush();
 		MyPState->ring_flush = MyPState->ring_unused;
 	}

--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,6 +1,5 @@
 use futures::future::Either;
 use proxy::auth;
-use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
 use proxy::metrics;
@@ -80,9 +79,6 @@ struct ProxyCliArgs {
    /// Allow self-signed certificates for compute nodes (for testing)
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    allow_self_signed_compute: bool,
-    /// timeout for http connections
-    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
-    sql_over_http_timeout: tokio::time::Duration,
 }

 #[tokio::main]
@@ -224,15 +220,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            auth::BackendType::Link(Cow::Owned(url))
        }
    };
-    let http_config = HttpConfig {
-        sql_over_http_timeout: args.sql_over_http_timeout,
-    };
+
    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
        metric_collection,
        allow_self_signed_compute: args.allow_self_signed_compute,
-        http_config,
    }));

    Ok(config)
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -13,7 +13,6 @@ pub struct ProxyConfig {
    pub auth_backend: auth::BackendType<'static, ()>,
    pub metric_collection: Option<MetricCollectionConfig>,
    pub allow_self_signed_compute: bool,
-    pub http_config: HttpConfig,
 }

 #[derive(Debug)]
@@ -27,10 +26,6 @@ pub struct TlsConfig {
    pub common_names: Option<HashSet<String>>,
 }

-pub struct HttpConfig {
-    pub sql_over_http_timeout: tokio::time::Duration,
-}
-
 impl TlsConfig {
    pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
        self.config.clone()
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -89,10 +89,7 @@ pub mod errors {
                Self::Console {
                    status: http::StatusCode::LOCKED,
                    ref text,
-                } => {
-                    !text.contains("written data quota exceeded")
-                        && !text.contains("the limit for current plan reached")
-                }
+                } => !text.contains("quota"),
                // retry server errors
                Self::Console { status, .. } if status.is_server_error() => true,
                _ => false,
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -20,7 +20,6 @@ use tokio_postgres::AsyncMessage;
 use crate::{
    auth, console,
    metrics::{Ids, MetricCounter, USAGE_METRICS},
-    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
 };
 use crate::{compute, config};

@@ -139,7 +138,6 @@ impl GlobalConnPool {
        session_id: uuid::Uuid,
    ) -> anyhow::Result<Client> {
        let mut client: Option<Client> = None;
-        let mut latency_timer = LatencyTimer::new("http");

        let mut hash_valid = false;
        if !force_new {
@@ -183,16 +181,15 @@ impl GlobalConnPool {
        let new_client = if let Some(client) = client {
            if client.inner.is_closed() {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
+                connect_to_compute(self.proxy_config, conn_info, session_id).await
            } else {
-                latency_timer.pool_hit();
                info!("pool: reusing connection '{conn_info}'");
                client.session.send(session_id)?;
                return Ok(client);
            }
        } else {
            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
+            connect_to_compute(self.proxy_config, conn_info, session_id).await
        };

        match &new_client {
@@ -349,7 +346,6 @@ async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
    session_id: uuid::Uuid,
-    latency_timer: LatencyTimer,
 ) -> anyhow::Result<Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -389,7 +385,6 @@ async fn connect_to_compute(
        node_info,
        &extra,
        &creds,
-        latency_timer,
    )
    .await
 }
@@ -423,42 +418,36 @@ async fn connect_to_compute_once(
    };

    tokio::spawn(
-        async move {
-            NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc();
-            scopeguard::defer! {
-                NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
+        poll_fn(move |cx| {
+            if matches!(rx.has_changed(), Ok(true)) {
+                session = *rx.borrow_and_update();
+                info!(%session, "changed session");
            }
-            poll_fn(move |cx| {
-                if matches!(rx.has_changed(), Ok(true)) {
-                    session = *rx.borrow_and_update();
-                    info!(%session, "changed session");
-                }

-                loop {
-                    let message = ready!(connection.poll_message(cx));
+            loop {
+                let message = ready!(connection.poll_message(cx));

-                    match message {
-                        Some(Ok(AsyncMessage::Notice(notice))) => {
-                            info!(%session, "notice: {}", notice);
-                        }
-                        Some(Ok(AsyncMessage::Notification(notif))) => {
-                            warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                        }
-                        Some(Ok(_)) => {
-                            warn!(%session, "unknown message");
-                        }
-                        Some(Err(e)) => {
-                            error!(%session, "connection error: {}", e);
-                            return Poll::Ready(())
-                        }
-                        None => {
-                            info!("connection closed");
-                            return Poll::Ready(())
-                        }
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session, "connection error: {}", e);
+                        return Poll::Ready(())
+                    }
+                    None => {
+                        info!("connection closed");
+                        return Poll::Ready(())
                    }
                }
-            }).await
-        }
+            }
+        })
        .instrument(span)
    );

--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -24,9 +24,6 @@ use url::Url;
 use utils::http::error::ApiError;
 use utils::http::json::json_response;

-use crate::config::HttpConfig;
-use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER};
-
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;

@@ -102,9 +99,9 @@ fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::E
        // convert to text with escaping
        Value::Bool(_) => serde_json::to_string(value).map(Some),
        Value::Number(_) => serde_json::to_string(value).map(Some),
+        Value::Object(_) => serde_json::to_string(value).map(Some),

        // here string needs to be escaped, as it is part of the array
-        Value::Object(_) => json_array_to_pg_array(&Value::String(serde_json::to_string(value)?)),
        Value::String(_) => serde_json::to_string(value).map(Some),

        // recurse into array
@@ -191,46 +188,28 @@ pub async fn handle(
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
-    config: &'static HttpConfig,
 ) -> Result<Response<Body>, ApiError> {
-    let result = tokio::time::timeout(
-        config.sql_over_http_timeout,
-        handle_inner(request, sni_hostname, conn_pool, session_id),
-    )
-    .await;
+    let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
+
    let mut response = match result {
-        Ok(r) => match r {
-            Ok(r) => r,
-            Err(e) => {
-                let message = format!("{:?}", e);
-                let code = e.downcast_ref::<tokio_postgres::Error>().and_then(|e| {
-                    e.code()
-                        .map(|s| serde_json::to_value(s.code()).unwrap_or_default())
-                });
-                let code = match code {
-                    Some(c) => c,
+        Ok(r) => r,
+        Err(e) => {
+            let message = format!("{:?}", e);
+            let code = match e.downcast_ref::<tokio_postgres::Error>() {
+                Some(e) => match e.code() {
+                    Some(e) => serde_json::to_value(e.code()).unwrap(),
                    None => Value::Null,
-                };
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                // TODO: this shouldn't always be bad request.
-                json_response(
-                    StatusCode::BAD_REQUEST,
-                    json!({ "message": message, "code": code }),
-                )?
-            }
-        },
-        Err(_) => {
-            let message = format!(
-                "HTTP-Connection timed out, execution time exeeded {} seconds",
-                config.sql_over_http_timeout.as_secs()
+                },
+                None => Value::Null,
+            };
+            error!(
+                ?code,
+                "sql-over-http per-client task finished with an error: {e:#}"
            );
-            error!(message);
+            // TODO: this shouldn't always be bad request.
            json_response(
-                StatusCode::GATEWAY_TIMEOUT,
-                json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
+                StatusCode::BAD_REQUEST,
+                json!({ "message": message, "code": code }),
            )?
        }
    };
@@ -248,13 +227,6 @@ async fn handle_inner(
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
 ) -> anyhow::Result<Response<Body>> {
-    NUM_CONNECTIONS_ACCEPTED_COUNTER
-        .with_label_values(&["http"])
-        .inc();
-    scopeguard::defer! {
-        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
-    }
-
    //
    // Determine the destination and connection params
    //
@@ -613,7 +585,7 @@ fn _pg_array_parse(
                    }
                }
            }
-            '}' if !quote => {
+            '}' => {
                level -= 1;
                if level == 0 {
                    push_checked(&mut entry, &mut entries, elem_type)?;
@@ -697,14 +669,6 @@ mod tests {
                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
            )]
        );
-        // array of objects
-        let json = r#"[{"foo": 1},{"bar": 2}]"#;
-        let json: Value = serde_json::from_str(json).unwrap();
-        let pg_params = json_to_pg_text(vec![json]).unwrap();
-        assert_eq!(
-            pg_params,
-            vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())]
-        );
    }

    #[test]
@@ -832,23 +796,4 @@ mod tests {
            json!([[[1, 2, 3], [4, 5, 6]]])
        );
    }
-    #[test]
-    fn test_pg_array_parse_json() {
-        fn pt(pg_arr: &str) -> Value {
-            pg_array_parse(pg_arr, &Type::JSONB).unwrap()
-        }
-        assert_eq!(pt(r#"{"{}"}"#), json!([{}]));
-        assert_eq!(
-            pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#),
-            json!([{"foo": 1, "bar": 2}])
-        );
-        assert_eq!(
-            pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#),
-            json!([{"foo": 1}, {"bar": 2}])
-        );
-        assert_eq!(
-            pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#),
-            json!([[{"foo": 1}, {"bar": 2}]])
-        );
-    }
 }
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -3,10 +3,7 @@ use crate::{
    config::ProxyConfig,
    error::io_error,
    protocol2::{ProxyProtocolAccept, WithClientIp},
-    proxy::{
-        handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
-        NUM_CLIENT_CONNECTION_OPENED_COUNTER,
-    },
+    proxy::{handle_client, ClientMode},
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
@@ -205,14 +202,7 @@ async fn ws_handler(
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        sql_over_http::handle(
-            request,
-            sni_hostname,
-            conn_pool,
-            session_id,
-            &config.http_config,
-        )
-        .await
+        sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
        Response::builder()
            .header("Allow", "OPTIONS, POST")
@@ -285,25 +275,23 @@ pub async fn task_main(
            let conn_pool = conn_pool.clone();

            async move {
-                Ok::<_, Infallible>(MetricService::new(hyper::service::service_fn(
-                    move |req: Request<Body>| {
-                        let sni_name = sni_name.clone();
-                        let conn_pool = conn_pool.clone();
+                Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request<Body>| {
+                    let sni_name = sni_name.clone();
+                    let conn_pool = conn_pool.clone();

-                        async move {
-                            let cancel_map = Arc::new(CancelMap::default());
-                            let session_id = uuid::Uuid::new_v4();
+                    async move {
+                        let cancel_map = Arc::new(CancelMap::default());
+                        let session_id = uuid::Uuid::new_v4();

-                            ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
-                                .instrument(info_span!(
-                                    "ws-client",
-                                    session = %session_id,
-                                    %peer_addr,
-                                ))
-                                .await
-                        }
-                    },
-                )))
+                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
+                            .instrument(info_span!(
+                                "ws-client",
+                                session = %session_id,
+                                %peer_addr,
+                            ))
+                            .await
+                    }
+                }))
            }
        },
    );
@@ -315,41 +303,3 @@ pub async fn task_main(

    Ok(())
 }
-
-struct MetricService<S> {
-    inner: S,
-}
-
-impl<S> MetricService<S> {
-    fn new(inner: S) -> MetricService<S> {
-        NUM_CLIENT_CONNECTION_OPENED_COUNTER
-            .with_label_values(&["http"])
-            .inc();
-        MetricService { inner }
-    }
-}
-
-impl<S> Drop for MetricService<S> {
-    fn drop(&mut self) {
-        NUM_CLIENT_CONNECTION_CLOSED_COUNTER
-            .with_label_values(&["http"])
-            .inc();
-    }
-}
-
-impl<S, ReqBody> hyper::service::Service<Request<ReqBody>> for MetricService<S>
-where
-    S: hyper::service::Service<Request<ReqBody>>,
-{
-    type Response = S::Response;
-    type Error = S::Error;
-    type Future = S::Future;
-
-    fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
-        self.inner.poll_ready(cx)
-    }
-
-    fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
-        self.inner.call(req)
-    }
-}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -7,7 +7,6 @@ use crate::{
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
-    http::StatusCode,
    metrics::{Ids, USAGE_METRICS},
    protocol2::WithClientIp,
    stream::{PqStream, Stream},
@@ -15,11 +14,12 @@ use crate::{
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
+use metrics::{
+    exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
+};
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use prometheus::{register_histogram_vec, HistogramVec};
-use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
+use std::{error::Error, io, ops::ControlFlow, sync::Arc};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
    time,
@@ -38,111 +38,34 @@ const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";

-pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_opened_db_connections_total",
-        "Number of opened connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_closed_db_connections_total",
-        "Number of closed connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_opened_client_connections_total",
-        "Number of opened connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_closed_client_connections_total",
-        "Number of closed connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_accepted_connections_total",
-        "Number of client connections accepted.",
+        "Number of TCP client connections accepted.",
        &["protocol"],
    )
    .unwrap()
 });

-pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_closed_connections_total",
-        "Number of client connections closed.",
+        "Number of TCP client connections closed.",
        &["protocol"],
    )
    .unwrap()
 });

-static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "proxy_compute_connection_latency_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
-        &["protocol", "cache_miss", "pool_miss"],
        // largest bucket = 2^16 * 0.5ms = 32s
        exponential_buckets(0.0005, 2.0, 16).unwrap(),
    )
    .unwrap()
 });

-pub struct LatencyTimer {
-    start: Instant,
-    pool_miss: bool,
-    cache_miss: bool,
-    protocol: &'static str,
-}
-
-impl LatencyTimer {
-    pub fn new(protocol: &'static str) -> Self {
-        Self {
-            start: Instant::now(),
-            cache_miss: false,
-            // by default we don't do pooling
-            pool_miss: true,
-            protocol,
-        }
-    }
-
-    pub fn cache_miss(&mut self) {
-        self.cache_miss = true;
-    }
-
-    pub fn pool_hit(&mut self) {
-        self.pool_miss = false;
-    }
-}
-
-impl Drop for LatencyTimer {
-    fn drop(&mut self) {
-        let duration = self.start.elapsed().as_secs_f64();
-        COMPUTE_CONNECTION_LATENCY
-            .with_label_values(&[
-                self.protocol,
-                bool_to_str(self.cache_miss),
-                bool_to_str(self.pool_miss),
-            ])
-            .observe(duration)
-    }
-}
-
 static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_connection_failures_total",
@@ -152,15 +75,6 @@ static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

-static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_breakdown",
-        "Number of wake-up failures (per kind).",
-        &["retry", "kind"],
-    )
-    .unwrap()
-});
-
 static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_io_bytes_per_client",
@@ -294,16 +208,12 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        "handling interactive connection from client"
    );

-    let proto = mode.protocol_label();
-    NUM_CLIENT_CONNECTION_OPENED_COUNTER
-        .with_label_values(&[proto])
-        .inc();
+    // The `closed` counter will increase when this future is destroyed.
    NUM_CONNECTIONS_ACCEPTED_COUNTER
-        .with_label_values(&[proto])
+        .with_label_values(&[mode.protocol_label()])
        .inc();
    scopeguard::defer! {
-        NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
+        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
    }

    let tls = config.tls_config.as_ref();
@@ -338,7 +248,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        mode.allow_self_signed_compute(config),
    );
    cancel_map
-        .with_session(|session| client.connect_to_db(session, mode))
+        .with_session(|session| client.connect_to_db(session, mode.allow_cleartext()))
        .await
 }

@@ -487,46 +397,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
    }
 }

-const fn bool_to_str(x: bool) -> &'static str {
-    if x {
-        "true"
-    } else {
-        "false"
-    }
-}
-
-fn report_error(e: &WakeComputeError, retry: bool) {
-    use crate::console::errors::ApiError;
-    let retry = bool_to_str(retry);
-    let kind = match e {
-        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
-        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ref text,
-        }) if text.contains("written data quota exceeded")
-            || text.contains("the limit for current plan reached") =>
-        {
-            "quota_exceeded"
-        }
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ..
-        }) => "api_console_locked",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        }) => "api_console_bad_request",
-        WakeComputeError::ApiError(ApiError::Console { status, .. })
-            if status.is_server_error() =>
-        {
-            "api_console_other_server_error"
-        }
-        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
-    };
-    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
-}
-
 /// Try to connect to the compute node, retrying if necessary.
 /// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
@@ -535,12 +405,13 @@ pub async fn connect_to_compute<M: ConnectMechanism>(
    mut node_info: console::CachedNodeInfo,
    extra: &console::ConsoleReqExtra<'_>,
    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
-    mut latency_timer: LatencyTimer,
 ) -> Result<M::Connection, M::Error>
 where
    M::ConnectError: ShouldRetry + std::fmt::Debug,
    M::Error: From<WakeComputeError>,
 {
+    let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();
+
    mechanism.update_connect_config(&mut node_info.config);

    // try once
@@ -552,8 +423,6 @@ where
        }
    };

-    latency_timer.cache_miss();
-
    let mut num_retries = 1;

    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
@@ -571,12 +440,10 @@ where
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                report_error(&e, false);
                return Err(e.into());
            }
            // failed to wake up but we can continue to retry
            Ok(ControlFlow::Continue(e)) => {
-                report_error(&e, true);
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
            }
            // successfully woke up a compute node and can break the wakeup loop
@@ -815,7 +682,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    async fn connect_to_db(
        self,
        session: cancellation::Session<'_>,
-        mode: ClientMode,
+        allow_cleartext: bool,
    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
@@ -830,10 +697,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            application_name: params.get("application_name"),
        };

-        let latency_timer = LatencyTimer::new(mode.protocol_label());
-
        let auth_result = match creds
-            .authenticate(&extra, &mut stream, mode.allow_cleartext())
+            .authenticate(&extra, &mut stream, allow_cleartext)
            .await
        {
            Ok(auth_result) => auth_result,
@@ -855,23 +720,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        node_info.allow_self_signed_compute = allow_self_signed_compute;

        let aux = node_info.aux.clone();
-        let mut node = connect_to_compute(
-            &TcpMechanism { params },
-            node_info,
-            &extra,
-            &creds,
-            latency_timer,
-        )
-        .or_else(|e| stream.throw_error(e))
-        .await?;
-
-        let proto = mode.protocol_label();
-        NUM_DB_CONNECTIONS_OPENED_COUNTER
-            .with_label_values(&[proto])
-            .inc();
-        scopeguard::defer! {
-            NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
-        }
+        let mut node = connect_to_compute(&TcpMechanism { params }, node_info, &extra, &creds)
+            .or_else(|e| stream.throw_error(e))
+            .await?;

        prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
        // Before proxy passing, forward to compute whatever data is left in the
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -450,7 +450,7 @@ async fn connect_to_compute_success() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Connect]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
    mechanism.verify();
@@ -461,7 +461,7 @@ async fn connect_to_compute_retry() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
    mechanism.verify();
@@ -473,7 +473,7 @@ async fn connect_to_compute_non_retry_1() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
    mechanism.verify();
@@ -485,7 +485,7 @@ async fn connect_to_compute_non_retry_2() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
    mechanism.verify();
@@ -501,7 +501,7 @@ async fn connect_to_compute_non_retry_3() {
        Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
    ]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
    mechanism.verify();
@@ -513,7 +513,7 @@ async fn wake_retry() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
    mechanism.verify();
@@ -525,7 +525,7 @@ async fn wake_non_retry() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
    mechanism.verify();
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -374,12 +374,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
    if conf.http_auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            #[allow(clippy::mutable_key_type)]
-            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> = Lazy::new(|| {
-                ["/v1/status", "/metrics"]
-                    .iter()
-                    .map(|v| v.parse().unwrap())
-                    .collect()
-            });
+            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
+                Lazy::new(|| ["/v1/status"].iter().map(|v| v.parse().unwrap()).collect());
            if ALLOWLIST_ROUTES.contains(request.uri()) {
                None
            } else {
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -723,9 +723,9 @@ impl Timeline {
            if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                return Ok(()); // nothing to do
            }
-
+            let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1);
            // release the lock before removing
-            shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
+            remover
        };

        // delete old WAL files
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1464,29 +1464,6 @@ class NeonCli(AbstractNeonCli):

        return self.raw_cli(args, check_return_code=check_return_code)

-    def map_branch(
-        self, name: str, tenant_id: TenantId, timeline_id: TimelineId
-    ) -> "subprocess.CompletedProcess[str]":
-        """
-        Map tenant id and timeline id to a neon_local branch name. They do not have to exist.
-        Usually needed when creating branches via PageserverHttpClient and not neon_local.
-
-        After creating a name mapping, you can use EndpointFactory.create_start
-        with this registered branch name.
-        """
-        args = [
-            "mappings",
-            "map",
-            "--branch-name",
-            name,
-            "--tenant-id",
-            str(tenant_id),
-            "--timeline-id",
-            str(timeline_id),
-        ]
-
-        return self.raw_cli(args, check_return_code=True)
-
    def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
        return self.raw_cli(["start"], check_return_code=check_return_code)

--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -74,14 +74,11 @@ def wait_until_tenant_state(
    for _ in range(iterations):
        try:
            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
-        except Exception as e:
-            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
-        else:
            log.debug(f"Tenant {tenant_id} data: {tenant}")
            if tenant["state"]["slug"] == expected_state:
                return tenant
-            if tenant["state"]["slug"] == "Broken":
-                raise RuntimeError(f"tenant became Broken, not {expected_state}")
+        except Exception as e:
+            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")

        time.sleep(period)

--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -65,7 +65,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it

    def start_single_table_workload(table_id: int):
        for _ in range(num_iters):
-            with env.pg.connect(options="-cstatement_timeout=300s").cursor() as cur:
+            with env.pg.connect().cursor() as cur:
                cur.execute(
                    f"INSERT INTO t{table_id} SELECT FROM generate_series(1,{new_rows_each_update})"
                )
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -1,24 +1,14 @@
 import random
 import threading
 import time
-from queue import SimpleQueue
-from typing import Any, Dict, List, Union
+from typing import List

 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    Endpoint,
-    NeonEnv,
-    NeonEnvBuilder,
-    PgBin,
-)
-from fixtures.pageserver.http import PageserverApiException
-from fixtures.pageserver.utils import wait_until_tenant_active
-from fixtures.types import Lsn, TimelineId
+from fixtures.neon_fixtures import Endpoint, NeonEnv, PgBin
+from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
-from requests import RequestException
-from requests.exceptions import RetryError


 # Test branch creation
@@ -138,245 +128,3 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi
    endpoint1 = env.endpoints.create_start("b1")

    pg_bin.run_capture(["pgbench", "-i", endpoint1.connstr()])
-
-
-def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonEnvBuilder):
-    """
-    Endpoint should not be possible to create because branch has not been uploaded.
-    """
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading"
-    )
-    ps_http = env.pageserver.http_client()
-
-    # pause all uploads
-    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
-
-    initial_branch = "initial_branch"
-
-    def start_creating_timeline():
-        with pytest.raises(RequestException):
-            ps_http.timeline_create(
-                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
-            )
-
-    t = threading.Thread(target=start_creating_timeline)
-    try:
-        t.start()
-
-        wait_until_paused(env, "before-upload-index-pausable")
-
-        env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline)
-
-        with pytest.raises(RuntimeError, match="is not active, state: Loading"):
-            env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
-    finally:
-        # FIXME: paused uploads bother shutdown
-        env.pageserver.stop(immediate=True)
-
-        t.join()
-
-
-def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder):
-    """
-    Branch should not be possible to create because ancestor has not been uploaded.
-    """
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
-    )
-    ps_http = env.pageserver.http_client()
-
-    # pause all uploads
-    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
-
-    def start_creating_timeline():
-        with pytest.raises(RequestException):
-            ps_http.timeline_create(
-                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
-            )
-
-    t = threading.Thread(target=start_creating_timeline)
-    try:
-        t.start()
-
-        wait_until_paused(env, "before-upload-index-pausable")
-
-        branch_id = TimelineId.generate()
-
-        with pytest.raises(RetryError, match="too many 503 error responses"):
-            ps_http.timeline_create(
-                env.pg_version,
-                env.initial_tenant,
-                branch_id,
-                ancestor_timeline_id=env.initial_timeline,
-            )
-
-        with pytest.raises(
-            PageserverApiException,
-            match=f"NotFound: Timeline {env.initial_tenant}/{branch_id} was not found",
-        ):
-            ps_http.timeline_detail(env.initial_tenant, branch_id)
-            # important to note that a task might still be in progress to complete
-            # the work, but will never get to that because we have the pause
-            # failpoint
-    finally:
-        # FIXME: paused uploads bother shutdown
-        env.pageserver.stop(immediate=True)
-
-        t.join()
-
-
-def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder):
-    """
-    If the activate only after upload is used, then retries could become competing.
-    """
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
-    )
-    env.pageserver.allowed_errors.append(
-        ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory"
-    )
-    ps_http = env.pageserver.http_client()
-
-    # pause all uploads
-    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
-
-    def start_creating_timeline():
-        ps_http.timeline_create(
-            env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
-        )
-
-    create_root = threading.Thread(target=start_creating_timeline)
-
-    branch_id = TimelineId.generate()
-
-    queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue()
-    barrier = threading.Barrier(3)
-
-    def try_branch():
-        barrier.wait()
-        barrier.wait()
-        try:
-            ret = ps_http.timeline_create(
-                env.pg_version,
-                env.initial_tenant,
-                branch_id,
-                ancestor_timeline_id=env.initial_timeline,
-                timeout=5,
-            )
-            queue.put(ret)
-        except Exception as e:
-            queue.put(e)
-
-    threads = [threading.Thread(target=try_branch) for _ in range(2)]
-
-    try:
-        create_root.start()
-
-        for t in threads:
-            t.start()
-
-        wait_until_paused(env, "before-upload-index-pausable")
-
-        barrier.wait()
-        ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
-        barrier.wait()
-
-        # now both requests race to branch, only one can win because they take gc_cs, Tenant::timelines or marker files
-        first = queue.get()
-        second = queue.get()
-
-        log.info(first)
-        log.info(second)
-
-        (succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first)
-        assert isinstance(failed, Exception)
-        assert isinstance(succeeded, Dict)
-
-        # FIXME: there's probably multiple valid status codes:
-        # - Timeline 62505b9a9f6b1d29117b1b74eaf07b12/56cd19d3b2dbcc65e9d53ec6ca304f24 already exists
-        # - whatever 409 response says, but that is a subclass of PageserverApiException
-        assert isinstance(failed, PageserverApiException)
-        assert succeeded["state"] == "Active"
-    finally:
-        # we might still have the failpoint active
-        env.pageserver.stop(immediate=True)
-
-        # pytest should nag if we leave threads unjoined
-        for t in threads:
-            t.join()
-        create_root.join()
-
-
-def test_non_uploaded_branch_availability_after_restart(neon_env_builder: NeonEnvBuilder):
-    """
-    Currently before RFC#27 we keep and continue uploading branches which were not successfully uploaded before shutdown.
-
-    This test likely duplicates some other test, but it's easier to write one than to make sure there will be a failing test when the rfc is implemented.
-    """
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    env.pageserver.allowed_errors.append(
-        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
-    )
-    ps_http = env.pageserver.http_client()
-
-    # pause all uploads
-    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-    ps_http.tenant_create(env.initial_tenant)
-
-    def start_creating_timeline():
-        with pytest.raises(RequestException):
-            ps_http.timeline_create(
-                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
-            )
-
-    t = threading.Thread(target=start_creating_timeline)
-    try:
-        t.start()
-
-        wait_until_paused(env, "before-upload-index-pausable")
-    finally:
-        # FIXME: paused uploads bother shutdown
-        env.pageserver.stop(immediate=True)
-        t.join()
-
-    # now without a failpoint
-    env.pageserver.start()
-
-    wait_until_tenant_active(ps_http, env.initial_tenant)
-
-    # currently it lives on and will get eventually uploaded, but this will change
-    detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
-    assert detail["state"] == "Active"
-
-
-def wait_until_paused(env: NeonEnv, failpoint: str):
-    found = False
-    msg = f"at failpoint {failpoint}"
-    for _ in range(20):
-        time.sleep(1)
-        found = env.pageserver.log_contains(msg) is not None
-        if found:
-            break
-    assert found
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import psycopg2
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, VanillaPostgres
+from fixtures.neon_fixtures import VanillaPostgres
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -205,10 +205,6 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
    ddl.wait()
    assert ddl.dbs == {"stork": "cork"}

-    cur.execute("DROP DATABASE stork")
-    ddl.wait()
-    assert ddl.dbs == {}
-
    with pytest.raises(psycopg2.InternalError):
        ddl.failures(True)
        cur.execute("CREATE DATABASE failure WITH OWNER=cork")
@@ -221,94 +217,6 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
        ddl.failures(True)
        cur.execute("DROP DATABASE failure")
        ddl.wait()
-    assert ddl.dbs == {"failure": "cork"}
-    ddl.failures(False)
-
-    # Check that db is still in the Postgres after failure
-    cur.execute("SELECT datconnlimit FROM pg_database WHERE datname = 'failure'")
-    result = cur.fetchone()
-    if not result:
-        raise AssertionError("Database 'failure' not found")
-    # -2 means invalid database
-    # It should be invalid because cplane request failed
-    assert result[0] == -2, "Database 'failure' is not invalid"
-
-    # Check that repeated drop succeeds
-    cur.execute("DROP DATABASE failure")
-    ddl.wait()
-    assert ddl.dbs == {}
-
-    # DB should be absent in the Postgres
-    cur.execute("SELECT count(*) FROM pg_database WHERE datname = 'failure'")
-    result = cur.fetchone()
-    if not result:
-        raise AssertionError("Could not count databases")
-    assert result[0] == 0, "Database 'failure' still exists after drop"
+    ddl.pg.connect(dbname="failure")  # Ensure we can connect after a failed drop

    conn.close()
-
-
-# Assert that specified database has a specific connlimit, throwing an AssertionError otherwise
-# -2 means invalid database
-# -1 means no specific per-db limit (default)
-def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str):
-    with endpoint.cursor() as cur:
-        cur.execute("SELECT datconnlimit FROM pg_database WHERE datname = %s", (db_name,))
-        result = cur.fetchone()
-        if not result:
-            raise AssertionError(f"Database '{db_name}' not found")
-        assert result[0] == connlimit, msg
-
-
-# Test that compute_ctl can deal with invalid databases (drop them).
-# If Postgres extension cannot reach cplane, then DROP will be aborted
-# and database will be marked as invalid. Then there are two recovery
-# flows:
-# 1. User can just repeat DROP DATABASE command until it succeeds
-# 2. User can ignore, then compute_ctl will drop invalid databases
-#    automatically during full configuration
-# Here we test the latter. The first one is tested in test_ddl_forwarding
-def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_ddl_forwarding_invalid_db",
-        # Some non-existent url
-        config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"],
-    )
-    log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch")
-
-    with endpoint.cursor() as cur:
-        cur.execute("SET neon.forward_ddl = false")
-        cur.execute("CREATE DATABASE failure")
-        cur.execute("COMMIT")
-
-    assert_db_connlimit(
-        endpoint, "failure", -1, "Database 'failure' doesn't have a valid connlimit"
-    )
-
-    with pytest.raises(psycopg2.InternalError):
-        with endpoint.cursor() as cur:
-            cur.execute("DROP DATABASE failure")
-            cur.execute("COMMIT")
-
-    # Should be invalid after failed drop
-    assert_db_connlimit(endpoint, "failure", -2, "Database 'failure' ins't invalid")
-
-    endpoint.stop()
-    endpoint.start()
-
-    # Still invalid after restart without full configuration
-    assert_db_connlimit(endpoint, "failure", -2, "Database 'failure' ins't invalid")
-
-    endpoint.stop()
-    endpoint.respec(skip_pg_catalog_updates=False)
-    endpoint.start()
-
-    # Should be cleaned up by compute_ctl during full configuration
-    with endpoint.cursor() as cur:
-        cur.execute("SELECT count(*) FROM pg_database WHERE datname = 'failure'")
-        result = cur.fetchone()
-        if not result:
-            raise AssertionError("Could not count databases")
-        assert result[0] == 0, "Database 'failure' still exists after restart"
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -3,10 +3,7 @@ import time
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.utils import (
-    wait_for_upload_queue_empty,
-    wait_until_tenant_active,
-)
+from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from requests.exceptions import ConnectionError

@@ -116,8 +113,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
    time.sleep(1)

    env.pageserver.start()
-    wait_until_tenant_active(pageserver_http, tenant_id)
-
    message = f".*duplicated L1 layer layer={l1_found.name}"
    env.pageserver.allowed_errors.append(message)

--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -10,7 +10,6 @@ of the pageserver are:
 """


-import enum
 import re
 import time
 from typing import Optional
@@ -82,7 +81,7 @@ def generate_uploads_and_deletions(
                    f"""
                INSERT INTO foo (id, val)
                SELECT g, '{data}'
-                FROM generate_series(1, 200) g
+                FROM generate_series(1, 20000) g
                ON CONFLICT (id) DO UPDATE
                SET val = EXCLUDED.val
                """,
@@ -104,22 +103,6 @@ def generate_uploads_and_deletions(
        assert gc_result["layers_removed"] > 0


-def read_all(
-    env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
-):
-    if tenant_id is None:
-        tenant_id = env.initial_tenant
-    assert tenant_id is not None
-
-    if timeline_id is None:
-        timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    env.pageserver.http_client()
-    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-        endpoint.safe_psql("SELECT SUM(LENGTH(val)) FROM foo;")
-
-
 def get_metric_or_0(ps_http, metric: str) -> int:
    v = ps_http.get_metric_value(metric)
    return 0 if v is None else int(v)
@@ -293,28 +276,15 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
    assert get_deletion_queue_unexpected_errors(ps_http) == 0


-class KeepAttachment(str, enum.Enum):
-    KEEP = "keep"
-    LOSE = "lose"
-
-
-class ValidateBefore(str, enum.Enum):
-    VALIDATE = "validate"
-    NO_VALIDATE = "no-validate"
-
-
-@pytest.mark.parametrize("keep_attachment", [KeepAttachment.KEEP, KeepAttachment.LOSE])
-@pytest.mark.parametrize("validate_before", [ValidateBefore.VALIDATE, ValidateBefore.NO_VALIDATE])
+@pytest.mark.parametrize("keep_attachment", [True, False])
+@pytest.mark.parametrize("validate_before", [True, False])
 def test_deletion_queue_recovery(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-    keep_attachment: KeepAttachment,
-    validate_before: ValidateBefore,
+    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool, validate_before: bool
 ):
    """
-    :param keep_attachment: whether to re-attach after restart.  Else, we act as if some other
+    :param keep_attachment: If true, we re-attach after restart.  Else, we act as if some other
    node took the attachment while we were restarting.
-    :param validate_before: whether to wait for deletions to be validated before restart.  This
+    :param validate_before: If true, we wait for deletions to be validated before restart.  This
    makes them elegible to be executed after restart, if the same node keeps the attachment.
    """
    neon_env_builder.enable_generations = True
@@ -330,7 +300,7 @@ def test_deletion_queue_recovery(
        ("deletion-queue-before-execute", "return"),
    ]

-    if validate_before == ValidateBefore.NO_VALIDATE:
+    if not validate_before:
        failpoints.append(
            # Prevent deletion lists from being validated, we will test that they are
            # dropped properly during recovery.  'pause' is okay here because we kill
@@ -350,25 +320,20 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

-    if validate_before == ValidateBefore.VALIDATE:
+    if validate_before:

        def assert_validation_complete():
            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)

        wait_until(20, 1, assert_validation_complete)
-
-        # The validatated keys statistic advances before the header is written, so we
-        # also wait to see the header hit the disk: this seems paranoid but the race
-        # can really happen on a heavily overloaded test machine.
-        def assert_header_written():
-            assert (env.pageserver.workdir / "deletion" / "header-01").exists()
-
-        wait_until(20, 1, assert_header_written)
+        # A short wait to let the DeletionHeader get written out, as this happens after
+        # the validated count gets incremented.
+        time.sleep(1)

    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

-    if keep_attachment == KeepAttachment.LOSE:
+    if not keep_attachment:
        some_other_pageserver = 101010
        assert env.attachment_service is not None
        env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
@@ -387,7 +352,7 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
+    if keep_attachment or validate_before:
        # - If we kept the attachment, then our pre-restart deletions should execute
        #   because on re-attach they were from the immediately preceding generation
        # - If we validated before restart, then the deletions should execute because the
@@ -413,118 +378,3 @@ def test_deletion_queue_recovery(

    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
-
-
-def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    neon_env_builder.enable_generations = True
-    neon_env_builder.enable_pageserver_remote_storage(
-        RemoteStorageKind.MOCK_S3,
-    )
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-
-    ps_http = env.pageserver.http_client()
-
-    generate_uploads_and_deletions(env)
-
-    env.pageserver.allowed_errors.extend(
-        [
-            # When the pageserver can't reach the control plane, it will complain
-            ".*calling control plane generation validation API failed.*",
-            # Emergency mode is a big deal, we log errors whenever it is used.
-            ".*Emergency mode!.*",
-        ]
-    )
-
-    # Simulate a major incident: the control plane goes offline
-    assert env.attachment_service is not None
-    env.attachment_service.stop()
-
-    # Remember how many validations had happened before the control plane went offline
-    validated = get_deletion_queue_validated(ps_http)
-
-    generate_uploads_and_deletions(env, init=False)
-
-    # The running pageserver should stop progressing deletions
-    time.sleep(10)
-    assert get_deletion_queue_validated(ps_http) == validated
-
-    # Restart the pageserver: ordinarily we would _avoid_ doing this during such an
-    # incident, but it might be unavoidable: if so, we want to be able to start up
-    # and serve clients.
-    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
-    env.pageserver.start(
-        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
-    )
-
-    # The pageserver should provide service to clients
-    generate_uploads_and_deletions(env, init=False)
-
-    # The pageserver should neither validate nor execute any deletions, it should have
-    # loaded the DeletionLists from before though
-    time.sleep(10)
-    assert get_deletion_queue_depth(ps_http) > 0
-    assert get_deletion_queue_validated(ps_http) == 0
-    assert get_deletion_queue_executed(ps_http) == 0
-
-    # When the control plane comes back up, normal service should resume
-    env.attachment_service.start()
-
-    ps_http.deletion_queue_flush(execute=True)
-    assert get_deletion_queue_depth(ps_http) == 0
-    assert get_deletion_queue_validated(ps_http) > 0
-    assert get_deletion_queue_executed(ps_http) > 0
-
-    # The pageserver should work fine when subsequently restarted in non-emergency mode
-    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
-    env.pageserver.start()
-
-    generate_uploads_and_deletions(env, init=False)
-    ps_http.deletion_queue_flush(execute=True)
-    assert get_deletion_queue_depth(ps_http) == 0
-    assert get_deletion_queue_validated(ps_http) > 0
-    assert get_deletion_queue_executed(ps_http) > 0
-
-
-def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    initial_local_layers = sorted(
-        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
-    )
-    client = env.pageserver.http_client()
-    for layer in initial_local_layers:
-        if "ephemeral" in layer.name:
-            continue
-        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
-        client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
-
-
-def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
-    """
-    Eviction and on-demand downloads exercise a particular code path where RemoteLayer is constructed
-    and must be constructed using the proper generation for the layer, which may not be the same generation
-    that the tenant is running in.
-    """
-    neon_env_builder.enable_generations = True
-    neon_env_builder.enable_pageserver_remote_storage(
-        RemoteStorageKind.MOCK_S3,
-    )
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    env.pageserver.http_client()
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    generate_uploads_and_deletions(env)
-
-    read_all(env, tenant_id, timeline_id)
-    evict_all_layers(env, tenant_id, timeline_id)
-    read_all(env, tenant_id, timeline_id)
-
-    # This will cause the generation to increment
-    env.pageserver.stop()
-    env.pageserver.start()
-
-    # Now we are running as generation 2, but must still correctly remember that the layers
-    # we are evicting and downloading are from generation 1.
-    read_all(env, tenant_id, timeline_id)
-    evict_all_layers(env, tenant_id, timeline_id)
-    read_all(env, tenant_id, timeline_id)
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -4,7 +4,6 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.remote_storage import s3_storage
-from fixtures.utils import wait_until


 # Test restarting page server, while safekeeper and compute node keep
@@ -17,7 +16,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)

    env = neon_env_builder.init_start()

-    endpoint = env.endpoints.create_start("main")
+    env.neon_cli.create_branch("test_pageserver_restart")
+    endpoint = env.endpoints.create_start("test_pageserver_restart")
    pageserver_http = env.pageserver.http_client()

    pg_conn = endpoint.connect()
@@ -75,52 +75,27 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

-    # Wait for metrics to indicate startup complete, so that we can know all
-    # startup phases will be reflected in the subsequent checks
-    def assert_complete():
-        for sample in pageserver_http.get_metrics().query_all(
-            "pageserver_startup_duration_seconds"
-        ):
-            labels = dict(sample.labels)
-            log.info(f"metric {labels['phase']}={sample.value}")
-            if labels["phase"] == "complete" and sample.value > 0:
-                return
-
-        raise AssertionError("No 'complete' metric yet")
-
-    wait_until(30, 1.0, assert_complete)
+    # Validate startup time metrics
+    metrics = pageserver_http.get_metrics()

    # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
-    expectations = [
-        (
-            "initial",
-            lambda t, p: True,
-        ),  # make no assumptions about the initial time point, it could be 0 in theory
-        # Remote phase of initial_tenant_load should happen before overall phase is complete
-        ("initial_tenant_load_remote", lambda t, p: t >= 0.0 and t >= p),
+    expectations = {
+        "initial": lambda t, p: True,  # make no assumptions about the initial time point, it could be 0 in theory
        # Initial tenant load should reflect the delay we injected
-        ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p),
+        "initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p,
        # Subsequent steps should occur in expected order
-        ("initial_logical_sizes", lambda t, p: t > 0 and t >= p),
-        ("background_jobs_can_start", lambda t, p: t > 0 and t >= p),
-        ("complete", lambda t, p: t > 0 and t >= p),
-    ]
+        "initial_logical_sizes": lambda t, p: t > 0 and t >= p,
+        "background_jobs_can_start": lambda t, p: t > 0 and t >= p,
+        "complete": lambda t, p: t > 0 and t >= p,
+    }

-    # Accumulate the runtime of each startup phase
-    values = {}
-    metrics = pageserver_http.get_metrics()
    prev_value = None
    for sample in metrics.query_all("pageserver_startup_duration_seconds"):
-        phase = sample.labels["phase"]
+        labels = dict(sample.labels)
+        phase = labels["phase"]
        log.info(f"metric {phase}={sample.value}")
-        assert phase in [e[0] for e in expectations], f"Unexpected phase {phase}"
-        values[phase] = sample
-
-    # Apply expectations to the metrics retrieved
-    for phase, expectation in expectations:
-        assert phase in values, f"No data for phase {phase}"
-        sample = values[phase]
-        assert expectation(
+        assert phase in expectations, f"Unexpected phase {phase}"
+        assert expectations[phase](
            sample.value, prev_value
        ), f"Unexpected value for {phase}: {sample.value}"
        prev_value = sample.value
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -17,8 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
    n_restarts = 10
    scale = 10

-    env.pageserver.allowed_errors.append(".*query handler.*failed.*Shutting down")
-
    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -188,7 +188,7 @@ def test_sql_over_http(static_proxy: NeonProxy):
            headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr},
            verify=str(static_proxy.test_output_dir / "proxy.crt"),
        )
-        assert response.status_code == 200, response.text
+        assert response.status_code == 200
        return response.json()

    rows = q("select 42 as answer")["rows"]
@@ -206,12 +206,6 @@ def test_sql_over_http(static_proxy: NeonProxy):
    rows = q("select $1::json->'a' as answer", [{"a": {"b": 42}}])["rows"]
    assert rows == [{"answer": {"b": 42}}]

-    rows = q("select $1::jsonb[] as answer", [[{}]])["rows"]
-    assert rows == [{"answer": [{}]}]
-
-    rows = q("select $1::jsonb[] as answer", [[{"foo": 1}, {"bar": 2}]])["rows"]
-    assert rows == [{"answer": [{"foo": 1}, {"bar": 2}]}]
-
    rows = q("select * from pg_class limit 1")["rows"]
    assert len(rows) == 1

--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -45,11 +45,14 @@ def test_tenant_delete_smoke(
        [
            # The deletion queue will complain when it encounters simulated S3 errors
            ".*deletion executor: DeleteObjects request failed.*",
-            # lucky race with stopping from flushing a layer we fail to schedule any uploads
-            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
        ]
    )

+    # lucky race with stopping from flushing a layer we fail to schedule any uploads
+    env.pageserver.allowed_errors.append(
+        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
+    )
+
    ps_http = env.pageserver.http_client()

    # first try to delete non existing tenant
@@ -191,9 +194,11 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
    )

    if simulate_failures:
-        env.pageserver.allowed_errors.append(
-            # The deletion queue will complain when it encounters simulated S3 errors
-            ".*deletion executor: DeleteObjects request failed.*",
+        env.pageserver.allowed_errors.extend(
+            [
+                # The deletion queue will complain when it encounters simulated S3 errors
+                ".*deletion executor: DeleteObjects request failed.*",
+            ]
        )

    ps_http = env.pageserver.http_client()
@@ -288,10 +293,6 @@ def test_tenant_delete_is_resumed_on_attach(
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
-    env.pageserver.allowed_errors.append(
-        # lucky race with stopping from flushing a layer we fail to schedule any uploads
-        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
-    )

    tenant_id = env.initial_tenant

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -752,9 +752,6 @@ def test_ignore_while_attaching(
    env.pageserver.allowed_errors.append(
        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
    )
-    # An endpoint is starting up concurrently with our detach, it can
-    # experience RPC failure due to shutdown.
-    env.pageserver.allowed_errors.append(".*query handler.*failed.*Shutting down")

    data_id = 1
    data_secret = "very secret secret"
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -6,7 +6,6 @@ from pathlib import Path
 from typing import List, Optional

 import asyncpg
-import pytest
 import toml
 from fixtures.log_helper import getLogger
 from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
@@ -598,10 +597,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
    assert res == expected_sum


-# Do inserts while restarting postgres and messing with safekeeper addresses.
-# The test takes more than default 5 minutes on Postgres 16,
-# see https://github.com/neondatabase/neon/issues/5305
-@pytest.mark.timeout(600)
+# do inserts while restarting postgres and messing with safekeeper addresses
 def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -31,14 +31,13 @@ futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
-futures-io = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 hex = { version = "0.4", features = ["serde"] }
 hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
-log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
+log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
@@ -48,7 +47,7 @@ prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
 regex-syntax = { version = "0.7" }
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] }
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.21", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
@@ -56,8 +55,7 @@ serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 smallvec = { version = "1", default-features = false, features = ["write"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-standback = { version = "0.2", default-features = false, features = ["std"] }
-time = { version = "0.3", features = ["macros", "serde-well-known"] }
+time = { version = "0.3", features = ["formatting", "macros", "parsing"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
@@ -77,16 +75,14 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 either = { version = "1" }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
-log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
+log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-syntax = { version = "0.7" }
 serde = { version = "1", features = ["alloc", "derive"] }
-standback = { version = "0.2", default-features = false, features = ["std"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
-time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }

 ### END HAKARI SECTION