pageserver: don't log deletion S3 op failures as errors

proxy: break down wake-up failure metrics (#4933)
## Problem close https://github.com/neondatabase/neon/issues/4702 ## Summary of changes This PR adds a new metric for wake-up errors and breaks it down by the most common reasons (mostly following the `could_retry` implementation).
2026-05-27 01:50:38 +00:00 · 2023-10-11 14:26:01 +01:00 · 2023-10-10 13:17:37 +01:00 · 2023-10-10 13:39:38 +02:00 · 2023-10-10 10:46:24 +01:00 · 2023-10-10 08:59:16 +01:00
64 changed files with 4278 additions and 3466 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1092,8 +1092,10 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+
+            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging supports different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2393,17 +2393,6 @@ dependencies = [
 "minimal-lexical",
 ]

-[[package]]
-name = "nostarve_queue"
-version = "0.1.0"
-dependencies = [
- "futures",
- "rand",
- "scopeguard",
- "tokio",
- "tracing",
-]
-
 [[package]]
 name = "notify"
 version = "5.2.0"
@@ -2715,7 +2704,6 @@ dependencies = [
 "itertools",
 "metrics",
 "nix 0.26.2",
- "nostarve_queue",
 "num-traits",
 "num_cpus",
 "once_cell",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,7 +26,6 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
-    "libs/nostarve_queue",
 ]

 [workspace.package]
@@ -181,7 +180,6 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
-nostarve_queue = { path = "./libs/nostarve_queue" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const RETRY_UNTIL_SECS: u64 = 40;
+const RETRY_UNTIL_SECS: u64 = 10;
 const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
 const RETRY_INTERVAL_MILLIS: u64 = 100;
 const DOT_EVERY_RETRIES: u64 = 10;
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -116,6 +116,7 @@ fn main() -> Result<()> {
            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
+            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };
@@ -816,6 +817,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    Ok(())
 }

+fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(ep_subcommand_data) => ep_subcommand_data,
+        None => bail!("no mappings subcommand provided"),
+    };
+
+    match sub_name {
+        "map" => {
+            let branch_name = sub_args
+                .get_one::<String>("branch-name")
+                .expect("branch-name argument missing");
+
+            let tenant_id = sub_args
+                .get_one::<String>("tenant-id")
+                .map(|x| TenantId::from_str(x))
+                .expect("tenant-id argument missing")
+                .expect("malformed tenant-id arg");
+
+            let timeline_id = sub_args
+                .get_one::<String>("timeline-id")
+                .map(|x| TimelineId::from_str(x))
+                .expect("timeline-id argument missing")
+                .expect("malformed timeline-id arg");
+
+            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
+
+            Ok(())
+        }
+        other => unimplemented!("mappings subcommand {other}"),
+    }
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
@@ -1084,6 +1117,7 @@ fn cli() -> Command {
    // --id, when using a pageserver command
    let pageserver_id_arg = Arg::new("pageserver-id")
        .long("id")
+        .global(true)
        .help("pageserver id")
        .required(false);
    // --pageserver-id when using a non-pageserver command
@@ -1254,17 +1288,20 @@ fn cli() -> Command {
            Command::new("pageserver")
                .arg_required_else_help(true)
                .about("Manage pageserver")
+                .arg(pageserver_id_arg)
                .subcommand(Command::new("status"))
-                .arg(pageserver_id_arg.clone())
-                .subcommand(Command::new("start").about("Start local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
-                .subcommand(Command::new("stop").about("Stop local pageserver")
-                .arg(pageserver_id_arg.clone())
-                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("start")
+                    .about("Start local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
+                .subcommand(Command::new("stop")
+                    .about("Stop local pageserver")
+                    .arg(stop_mode_arg.clone())
+                )
+                .subcommand(Command::new("restart")
+                    .about("Restart local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
        )
        .subcommand(
            Command::new("attachment_service")
@@ -1321,8 +1358,8 @@ fn cli() -> Command {
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg)
-                    .arg(timeline_id_arg)
+                    .arg(branch_name_arg.clone())
+                    .arg(timeline_id_arg.clone())
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
@@ -1335,7 +1372,7 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg)
+                    .arg(tenant_id_arg.clone())
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
@@ -1346,6 +1383,18 @@ fn cli() -> Command {
                )

        )
+        .subcommand(
+            Command::new("mappings")
+                .arg_required_else_help(true)
+                .about("Manage neon_local branch name mappings")
+                .subcommand(
+                    Command::new("map")
+                        .about("Create new mapping which cannot exist already")
+                        .arg(branch_name_arg.clone())
+                        .arg(tenant_id_arg.clone())
+                        .arg(timeline_id_arg.clone())
+                )
+        )
        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
        .subcommand(
            Command::new("pg")
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -18,7 +18,7 @@ use camino::Utf8PathBuf;
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use reqwest::blocking::{Client, ClientBuilder, RequestBuilder, Response};
+use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::auth::{Claims, Scope};
@@ -93,7 +93,7 @@ impl PageServerNode {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            conf: conf.clone(),
            env: env.clone(),
-            http_client: ClientBuilder::new().timeout(None).build().unwrap(),
+            http_client: Client::new(),
            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }
--- a/download_all_layers.py
+++ b/download_all_layers.py
@@ -1,20 +0,0 @@
-import requests
-
-tenants = requests.get("http://localhost:15003/v1/tenant")
-tenants.raise_for_status()
-tenants = tenants.json()
-
-for tenant in tenants:
-    id = tenant["id"]
-    timelines = requests.get(f"http://localhost:15003/v1/tenant/{id}/timeline")
-    timelines.raise_for_status()
-    for timeline in timelines.json():
-        tid = timeline["tenant_id"]
-        tlid = timeline["timeline_id"]
-        layers = requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer")
-        layers.raise_for_status()
-        layers = layers.json()
-        for l in layers["historic_layers"]:
-            if l["remote"] == False:
-                requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer/{l['layer_file_name']}")
-
--- a/libs/nostarve_queue/Cargo.toml
+++ b/libs/nostarve_queue/Cargo.toml
@@ -1,14 +0,0 @@
-[package]
-name = "nostarve_queue"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-scopeguard.workspace = true
-tracing.workspace = true
-
-[dev-dependencies]
-futures.workspace = true
-rand.workspace = true
-tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }
--- a/libs/nostarve_queue/src/lib.rs
+++ b/libs/nostarve_queue/src/lib.rs
@@ -1,316 +0,0 @@
-//! Synchronization primitive to prevent starvation among concurrent tasks that do the same work.
-
-use std::{
-    collections::VecDeque,
-    fmt,
-    future::poll_fn,
-    sync::Mutex,
-    task::{Poll, Waker},
-};
-
-pub struct Queue<T> {
-    inner: Mutex<Inner<T>>,
-}
-
-struct Inner<T> {
-    waiters: VecDeque<usize>,
-    free: VecDeque<usize>,
-    slots: Vec<Option<(Option<Waker>, Option<T>)>>,
-}
-
-#[derive(Clone, Copy)]
-pub struct Position<'q, T> {
-    idx: usize,
-    queue: &'q Queue<T>,
-}
-
-impl<T> fmt::Debug for Position<'_, T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("Position").field("idx", &self.idx).finish()
-    }
-}
-
-impl<T> Inner<T> {
-    #[cfg(not(test))]
-    #[inline]
-    fn integrity_check(&self) {}
-
-    #[cfg(test)]
-    fn integrity_check(&self) {
-        use std::collections::HashSet;
-        let waiters = self.waiters.iter().copied().collect::<HashSet<_>>();
-        let free = self.free.iter().copied().collect::<HashSet<_>>();
-        for (slot_idx, slot) in self.slots.iter().enumerate() {
-            match slot {
-                None => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(free.contains(&slot_idx));
-                }
-                Some((None, None)) => {
-                    assert!(waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((Some(_), Some(_))) => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((Some(_), None)) => {
-                    assert!(waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((None, Some(_))) => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-            }
-        }
-    }
-}
-
-impl<T> Queue<T> {
-    pub fn new(size: usize) -> Self {
-        Queue {
-            inner: Mutex::new(Inner {
-                waiters: VecDeque::new(),
-                free: (0..size).collect(),
-                slots: {
-                    let mut v = Vec::with_capacity(size);
-                    v.resize_with(size, || None);
-                    v
-                },
-            }),
-        }
-    }
-    pub fn begin(&self) -> Result<Position<T>, ()> {
-        #[cfg(test)]
-        tracing::trace!("get in line locking inner");
-        let mut inner = self.inner.lock().unwrap();
-        inner.integrity_check();
-        let my_waitslot_idx = inner
-            .free
-            .pop_front()
-            .expect("can't happen, len(slots) = len(waiters");
-        inner.waiters.push_back(my_waitslot_idx);
-        let prev = inner.slots[my_waitslot_idx].replace((None, None));
-        assert!(prev.is_none());
-        inner.integrity_check();
-        Ok(Position {
-            idx: my_waitslot_idx,
-            queue: &self,
-        })
-    }
-}
-
-impl<'q, T> Position<'q, T> {
-    pub fn complete_and_wait(self, datum: T) -> impl std::future::Future<Output = T> + 'q {
-        #[cfg(test)]
-        tracing::trace!("found victim locking waiters");
-        let mut inner = self.queue.inner.lock().unwrap();
-        inner.integrity_check();
-        let winner_idx = inner.waiters.pop_front().expect("we put ourselves in");
-        #[cfg(test)]
-        tracing::trace!(winner_idx, "putting victim into next waiters slot");
-        let winner_slot = inner.slots[winner_idx].as_mut().unwrap();
-        let prev = winner_slot.1.replace(datum);
-        assert!(
-            prev.is_none(),
-            "ensure we didn't mess up this simple ring buffer structure"
-        );
-        if let Some(waker) = winner_slot.0.take() {
-            #[cfg(test)]
-            tracing::trace!(winner_idx, "waking up winner");
-            waker.wake()
-        }
-        inner.integrity_check();
-        drop(inner); // the poll_fn locks it again
-
-        let mut poll_num = 0;
-        let mut drop_guard = Some(scopeguard::guard((), |()| {
-            panic!("must not drop this future until Ready");
-        }));
-
-        // take the victim that was found by someone else
-        poll_fn(move |cx| {
-            let my_waitslot_idx = self.idx;
-            poll_num += 1;
-            #[cfg(test)]
-            tracing::trace!(poll_num, "poll_fn locking waiters");
-            let mut inner = self.queue.inner.lock().unwrap();
-            inner.integrity_check();
-            let my_waitslot = inner.slots[self.idx].as_mut().unwrap();
-            // assert!(
-            //     poll_num <= 2,
-            //     "once we place the waker in the slot, next wakeup should have a result: {}",
-            //     my_waitslot.1.is_some()
-            // );
-            if let Some(res) = my_waitslot.1.take() {
-                #[cfg(test)]
-                tracing::trace!(poll_num, "have cache slot");
-                // above .take() resets the waiters slot to None
-                debug_assert!(my_waitslot.0.is_none());
-                debug_assert!(my_waitslot.1.is_none());
-                inner.slots[my_waitslot_idx] = None;
-                inner.free.push_back(my_waitslot_idx);
-                let _ = scopeguard::ScopeGuard::into_inner(drop_guard.take().unwrap());
-                inner.integrity_check();
-                return Poll::Ready(res);
-            }
-            // assert_eq!(poll_num, 1);
-            if !my_waitslot
-                .0
-                .as_ref()
-                .map(|existing| cx.waker().will_wake(existing))
-                .unwrap_or(false)
-            {
-                let prev = my_waitslot.0.replace(cx.waker().clone());
-                #[cfg(test)]
-                tracing::trace!(poll_num, prev_is_some = prev.is_some(), "updating waker");
-            }
-            inner.integrity_check();
-            #[cfg(test)]
-            tracing::trace!(poll_num, "waiting to be woken up");
-            Poll::Pending
-        })
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{
-        sync::{
-            atomic::{AtomicBool, Ordering},
-            Arc,
-        },
-        task::Poll,
-        time::Duration,
-    };
-
-    use rand::RngCore;
-
-    #[tokio::test]
-    async fn in_order_completion_and_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        assert_eq!(q1.complete_and_wait(23).await, 23);
-        assert_eq!(q2.complete_and_wait(42).await, 42);
-    }
-
-    #[tokio::test]
-    async fn out_of_order_completion_and_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        let mut q2compfut = q2.complete_and_wait(23);
-
-        match futures::poll!(&mut q2compfut) {
-            Poll::Pending => {}
-            Poll::Ready(_) => panic!("should not be ready yet, it's queued after q1"),
-        }
-
-        let q1res = q1.complete_and_wait(42).await;
-        assert_eq!(q1res, 23);
-
-        let q2res = q2compfut.await;
-        assert_eq!(q2res, 42);
-    }
-
-    #[tokio::test]
-    async fn in_order_completion_out_of_order_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        let mut q1compfut = q1.complete_and_wait(23);
-
-        let mut q2compfut = q2.complete_and_wait(42);
-
-        match futures::poll!(&mut q2compfut) {
-            Poll::Pending => {
-                unreachable!("q2 should be ready, it wasn't first but q1 is serviced already")
-            }
-            Poll::Ready(x) => assert_eq!(x, 42),
-        }
-
-        assert_eq!(futures::poll!(&mut q1compfut), Poll::Ready(23));
-    }
-
-    #[tokio::test(flavor = "multi_thread")]
-    async fn stress() {
-        let ntasks = 8;
-        let queue_size = 8;
-        let queue = Arc::new(super::Queue::new(queue_size));
-
-        let stop = Arc::new(AtomicBool::new(false));
-
-        let mut tasks = vec![];
-        for i in 0..ntasks {
-            let jh = tokio::spawn({
-                let queue = Arc::clone(&queue);
-                let stop = Arc::clone(&stop);
-                async move {
-                    while !stop.load(Ordering::Relaxed) {
-                        let q = queue.begin().unwrap();
-                        for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
-                            std::hint::spin_loop();
-                        }
-                        q.complete_and_wait(i).await;
-                        tokio::task::yield_now().await;
-                    }
-                }
-            });
-            tasks.push(jh);
-        }
-
-        tokio::time::sleep(Duration::from_secs(10)).await;
-
-        stop.store(true, Ordering::Relaxed);
-
-        for t in tasks {
-            t.await.unwrap();
-        }
-    }
-
-    #[test]
-    fn stress_two_runtimes_shared_queue() {
-        std::thread::scope(|s| {
-            let ntasks = 8;
-            let queue_size = 8;
-            let queue = Arc::new(super::Queue::new(queue_size));
-
-            let stop = Arc::new(AtomicBool::new(false));
-
-            for i in 0..ntasks {
-                s.spawn({
-                    let queue = Arc::clone(&queue);
-                    let stop = Arc::clone(&stop);
-                    move || {
-                        let rt = tokio::runtime::Builder::new_current_thread()
-                            .enable_all()
-                            .build()
-                            .unwrap();
-                        rt.block_on(async move {
-                            while !stop.load(Ordering::Relaxed) {
-                                let q = queue.begin().unwrap();
-                                for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
-                                    std::hint::spin_loop();
-                                }
-                                q.complete_and_wait(i).await;
-                                tokio::task::yield_now().await;
-                            }
-                        });
-                    }
-                });
-            }
-
-            std::thread::sleep(Duration::from_secs(10));
-
-            stop.store(true, Ordering::Relaxed);
-        });
-    }
-}
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -442,10 +442,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            trace!("got message {:?}", msg);

            let result = self.process_message(handler, msg, &mut query_string).await;
-            self.flush().await?;
+            tokio::select!(
+                biased;
+                _ = shutdown_watcher() => {
+                    // We were requested to shut down.
+                    tracing::info!("shutdown request received during response flush");
+                    return Ok(())
+                },
+                flush_r = self.flush() => {
+                    flush_r?;
+                }
+            );
+
            match result? {
                ProcessMsgResult::Continue => {
-                    self.flush().await?;
                    continue;
                }
                ProcessMsgResult::Break => break,
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,8 +1,9 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::error;
+use tracing::{error, info};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -25,7 +26,7 @@ pub enum ApiError {
    PreconditionFailed(Box<str>),

    #[error("Resource temporarily unavailable: {0}")]
-    ResourceUnavailable(String),
+    ResourceUnavailable(Cow<'static, str>),

    #[error("Shutting down")]
    ShuttingDown,
@@ -115,10 +116,12 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {

 pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error {
-        error!("Error processing HTTP request: {api_error:?}");
-    } else {
-        error!("Error processing HTTP request: {api_error:#}");
+
+    match api_error {
+        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
+        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
+        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
+        _ => error!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -58,7 +58,7 @@ where
 // to get that.
 impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        other.wake_num.partial_cmp(&self.wake_num)
+        Some(self.cmp(other))
    }
 }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,7 +37,6 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
-nostarve_queue.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
 num-traits.workspace = true
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -580,31 +580,6 @@ fn start_pageserver(
        );
    }

-    task_mgr::spawn(
-        BACKGROUND_RUNTIME.handle(),
-        TaskKind::BackgroundRuntimeTurnaroundMeasure,
-        None,
-        None,
-        "background runtime turnaround measure",
-        true,
-        async move {
-            let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
-            let server = server
-                .serve(hyper::service::make_service_fn(|_| async move {
-                    Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
-                        move |_: hyper::Request<hyper::Body>| async move {
-                            Ok::<_, std::convert::Infallible>(hyper::Response::new(
-                                hyper::Body::from(format!("alive")),
-                            ))
-                        },
-                    ))
-                }))
-                .with_graceful_shutdown(task_mgr::shutdown_watcher());
-            server.await?;
-            Ok(())
-        },
-    );
-
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -211,6 +211,10 @@ pub struct PageServerConf {

    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option<SecretString>,
+
+    /// If true, pageserver will make best-effort to operate without a control plane: only
+    /// for use in major incidents.
+    pub control_plane_emergency_mode: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -288,6 +292,7 @@ struct PageServerConfigBuilder {

    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
+    control_plane_emergency_mode: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -355,6 +360,7 @@ impl Default for PageServerConfigBuilder {

            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
+            control_plane_emergency_mode: Set(false),
        }
    }
 }
@@ -491,6 +497,10 @@ impl PageServerConfigBuilder {
        self.control_plane_api_token = BuilderValue::Set(token)
    }

+    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
+        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -582,6 +592,9 @@ impl PageServerConfigBuilder {
            control_plane_api_token: self
                .control_plane_api_token
                .ok_or(anyhow!("missing control_plane_api_token"))?,
+            control_plane_emergency_mode: self
+                .control_plane_emergency_mode
+                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
        })
    }
 }
@@ -807,6 +820,10 @@ impl PageServerConf {
                        builder.control_plane_api_token(Some(parsed.into()))
                    }
                },
+                "control_plane_emergency_mode" => {
+                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
+
+                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -976,6 +993,7 @@ impl PageServerConf {
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
            control_plane_api_token: None,
+            control_plane_emergency_mode: false,
        }
    }
 }
@@ -1199,7 +1217,8 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
                control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1255,7 +1274,8 @@ background_task_maximum_delay = '334 s'
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,7 +2,6 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
@@ -144,7 +143,7 @@ pub async fn collect_metrics(
        crate::tenant::tasks::warn_when_period_overrun(
            tick_at.elapsed(),
            metric_collection_interval,
-            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
+            "consumption_metrics_collect_metrics",
        );
    }
 }
@@ -269,11 +268,6 @@ async fn calculate_synthetic_size_worker(
            }

            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
-                // TODO should we just use concurrent_background_tasks_rate_limit().
-                // We can put in some prioritization for consumption metrics.
-                // Same for the loop that fetches computed metrics.
-                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-                // which turns out is really handy to understand the system.
                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
@@ -283,7 +277,7 @@ async fn calculate_synthetic_size_worker(
        crate::tenant::tasks::warn_when_period_overrun(
            tick_at.elapsed(),
            synthetic_size_calculation_interval,
-            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
+            "consumption_metrics_synthetic_size_worker",
        );
    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -133,6 +133,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            node_id: self.node_id,
        };

+        fail::fail_point!("control-plane-client-re-attach");
+
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
@@ -168,6 +170,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

+        fail::fail_point!("control-plane-client-validate");
+
        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

        Ok(response
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -40,7 +40,6 @@ use validator::ValidatorQueueMessage;

 use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};

-// TODO: adminstrative "panic button" config property to disable all deletions
 // TODO: configurable for how long to wait before executing deletions

 /// We aggregate object deletions from many tenants in one place, for several reasons:
@@ -154,7 +153,7 @@ impl FlushOp {

 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
-    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
@@ -186,7 +185,7 @@ where
    V: Serialize,
    I: AsRef<[u8]>,
 {
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));

    transformed
        .collect::<HashMap<String, &V>>()
@@ -213,7 +212,7 @@ where

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
-const TEMP_SUFFIX: &str = ".tmp";
+const TEMP_SUFFIX: &str = "tmp";

 #[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
@@ -325,10 +324,7 @@ impl DeletionList {
            return false;
        }

-        let timeline_entry = tenant_entry
-            .timelines
-            .entry(*timeline)
-            .or_insert_with(Vec::new);
+        let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();

        let timeline_remote_path = remote_timeline_path(tenant, timeline);

@@ -420,7 +416,7 @@ pub enum DeletionQueueError {
 impl DeletionQueueClient {
    pub(crate) fn broken() -> Self {
        // Channels whose receivers are immediately dropped.
-        let (tx, _rx) = tokio::sync::mpsc::channel(1);
+        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
        Self {
            tx,
@@ -432,12 +428,12 @@ impl DeletionQueueClient {
    /// This is cancel-safe.  If you drop the future before it completes, the message
    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
    /// we decide to do a deletion the decision is always final.
-    async fn do_push<T>(
+    fn do_push<T>(
        &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
        msg: T,
    ) -> Result<(), DeletionQueueError> {
-        match queue.send(msg).await {
+        match queue.send(msg) {
            Ok(_) => Ok(()),
            Err(e) => {
                // This shouldn't happen, we should shut down all tenants before
@@ -449,7 +445,7 @@ impl DeletionQueueClient {
        }
    }

-    pub(crate) async fn recover(
+    pub(crate) fn recover(
        &self,
        attached_tenants: HashMap<TenantId, Generation>,
    ) -> Result<(), DeletionQueueError> {
@@ -457,7 +453,6 @@ impl DeletionQueueClient {
            &self.tx,
            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
        )
-        .await
    }

    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
@@ -530,6 +525,21 @@ impl DeletionQueueClient {
            return self.flush_immediate().await;
        }

+        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+    }
+
+    /// When a Tenant has a generation, push_layers is always synchronous because
+    /// the ListWriter channel is an unbounded channel.
+    ///
+    /// This can be merged into push_layers when we remove the Generation-less mode
+    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
+    pub(crate) fn push_layers_sync(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -543,17 +553,16 @@ impl DeletionQueueClient {
                objects: Vec::new(),
            }),
        )
-        .await
    }

    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
    async fn do_flush<T>(
        &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
        msg: T,
        rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), DeletionQueueError> {
-        self.do_push(queue, msg).await?;
+        self.do_push(queue, msg)?;
        if rx.await.is_err() {
            // This shouldn't happen if tenants are shut down before deletion queue.  If we
            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
@@ -574,6 +583,18 @@ impl DeletionQueueClient {
            .await
    }

+    /// Issue a flush without waiting for it to complete.  This is useful on advisory flushes where
+    /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
+    /// detach where flushing is nice but not necessary.
+    ///
+    /// This function provides no guarantees of work being done.
+    pub fn flush_advisory(&self) {
+        let (flush_op, _) = FlushOp::new();
+
+        // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
+        drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
+    }
+
    // Wait until all previous deletions are executed
    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
        debug!("flush_execute: flushing to deletion lists...");
@@ -590,9 +611,7 @@ impl DeletionQueueClient {
        // Flush any immediate-mode deletions (the above backend flush will only flush
        // the executor if deletions had flowed through the backend)
        debug!("flush_execute: flushing execution...");
-        let (flush_op, rx) = FlushOp::new();
-        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
-            .await?;
+        self.flush_immediate().await?;
        debug!("flush_execute: finished flushing execution...");
        Ok(())
    }
@@ -647,8 +666,10 @@ impl DeletionQueue {
    where
        C: ControlPlaneGenerationsApi + Send + Sync,
    {
-        // Deep channel: it consumes deletions from all timelines and we do not want to block them
-        let (tx, rx) = tokio::sync::mpsc::channel(16384);
+        // Unbounded channel: enables non-async functions to submit deletions.  The actual length is
+        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
+        // enough to avoid this taking a pathologically large amount of memory.
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();

        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
@@ -961,7 +982,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_id = ctx.harness.tenant_id;
@@ -1029,7 +1050,7 @@ mod test {
    async fn deletion_queue_validation() -> anyhow::Result<()> {
        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        // Generation that the control plane thinks is current
        let latest_generation = Generation::new(0xdeadbeef);
@@ -1086,7 +1107,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        let tenant_id = ctx.harness.tenant_id;

@@ -1149,9 +1170,7 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client
-            .recover(HashMap::from([(tenant_id, now_generation)]))
-            .await?;
+        client.recover(HashMap::from([(tenant_id, now_generation)]))?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1177,7 +1196,7 @@ pub(crate) mod mock {
    };

    pub struct ConsumerState {
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
    }

@@ -1254,7 +1273,7 @@ pub(crate) mod mock {
    }

    pub struct MockDeletionQueue {
-        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
        executed: Arc<AtomicUsize>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -1264,7 +1283,7 @@ pub(crate) mod mock {

    impl MockDeletionQueue {
        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
-            let (tx, rx) = tokio::sync::mpsc::channel(16384);
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

            let executed = Arc::new(AtomicUsize::new(0));
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -12,7 +12,6 @@ use remote_storage::MAX_KEYS_PER_DELETE;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
-use tracing::warn;

 use crate::metrics;

@@ -88,7 +87,10 @@ impl Deleter {
                    self.accumulator.clear();
                }
                Err(e) => {
-                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    // The RemoteStorage interface doesn't discriminate between
+                    // real errors and 503/429 responses, so we log at INFO level
+                    // to avoid propagating spurious error-severity logs.
+                    info!("DeleteObjects request failed: {e:#}, will retry");
                    metrics::DELETION_QUEUE
                        .remote_errors
                        .with_label_values(&["execute"])
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -85,7 +85,7 @@ pub(super) struct ListWriter {
    conf: &'static PageServerConf,

    // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+    rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

    // Outbound requests to the backend to execute deletion lists we have composed.
    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
@@ -111,7 +111,7 @@ impl ListWriter {

    pub(super) fn new(
        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
        cancel: CancellationToken,
    ) -> Self {
@@ -230,6 +230,7 @@ impl ListWriter {
        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();

+        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await? {
@@ -241,7 +242,7 @@ impl ListWriter {
                continue;
            }

-            if dentry_str.ends_with(TEMP_SUFFIX) {
+            if dentry_str.ends_with(&temp_extension) {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -220,6 +220,8 @@ where
                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
+                } else {
+                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
                }
                this_list_valid
            });
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,9 +93,16 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    delete:
      description: |
-        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
+        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
        "400":
@@ -134,6 +141,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -178,6 +192,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
@@ -226,6 +247,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    delete:
      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
@@ -265,13 +293,19 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-
        "500":
          description: Generic operation error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
@@ -328,6 +362,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -375,6 +416,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/attach:
    parameters:
      - name: tenant_id
@@ -465,6 +513,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/detach:
    parameters:
@@ -518,6 +573,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/ignore:
    parameters:
@@ -560,6 +622,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/load:
    parameters:
@@ -604,6 +673,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
@@ -641,6 +717,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/size:
    parameters:
@@ -704,6 +786,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -780,6 +869,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -810,6 +906,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    post:
      description: |
        Create a tenant. Returns new tenant id on success.
@@ -860,6 +963,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/config:
    put:
@@ -905,6 +1015,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/config/:
    parameters:
      - name: tenant_id
@@ -954,6 +1071,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
 components:
  securitySchemes:
    JWT:
@@ -1220,6 +1344,13 @@ components:
      properties:
        msg:
          type: string
+    ServiceUnavailableError:
+      type: object
+      required:
+        - msg
+      properties:
+        msg:
+          type: string
    NotFoundError:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -6,6 +6,7 @@ use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
+use hyper::header::CONTENT_TYPE;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -133,7 +134,7 @@ impl From<PageReconstructError> for ApiError {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
            PageReconstructError::AncestorStopping(_) => {
-                ApiError::ResourceUnavailable(format!("{pre}"))
+                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
            PageReconstructError::WalRedo(pre) => {
                ApiError::InternalServerError(anyhow::Error::new(pre))
@@ -146,7 +147,7 @@ impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{tmie}"))
+                ApiError::ResourceUnavailable(format!("{tmie}").into())
            }
            TenantMapInsertError::TenantAlreadyExists(id, state) => {
                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
@@ -395,6 +396,9 @@ async fn timeline_create_handler(
                    format!("{err:#}")
                ))
            }
+            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
+            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -571,9 +575,14 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
-        .instrument(info_span!("tenant_detach", %tenant_id))
-        .await?;
+    mgr::detach_tenant(
+        conf,
+        tenant_id,
+        detach_ignored.unwrap_or(false),
+        &state.deletion_queue_client,
+    )
+    .instrument(info_span!("tenant_detach", %tenant_id))
+    .await?;

    json_response(StatusCode::OK, ())
 }
@@ -636,7 +645,7 @@ async fn tenant_list_handler(
        .instrument(info_span!("tenant_list"))
        .await
        .map_err(|_| {
-            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
+            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
        .iter()
        .map(|(id, state)| TenantInfo {
@@ -1030,7 +1039,7 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        mgr::detach_tenant(conf, tenant_id, true)
+        mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
            .instrument(info_span!("tenant_detach", %tenant_id))
            .await?;
        return json_response(StatusCode::OK, ());
@@ -1236,6 +1245,136 @@ async fn deletion_queue_flush(
    }
 }

+/// Check whether a `GetPage@Lsn` request succeeds; useful for manual debugging.
+async fn getpage_at_lsn_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    struct Key(crate::repository::Key);
+
+    impl std::str::FromStr for Key {
+        type Err = anyhow::Error;
+
+        fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+            crate::repository::Key::from_hex(s).map(Key)
+        }
+    }
+
+    let key: Key = parse_query_param(&request, "key")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
+    let lsn: Lsn = parse_query_param(&request, "lsn")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
+
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+
+        let page = timeline.get(key.0, lsn, &ctx).await?;
+
+        Result::<_, ApiError>::Ok(
+            Response::builder()
+                .status(StatusCode::OK)
+                .header(CONTENT_TYPE, "application/octet-stream")
+                .body(hyper::Body::from(page))
+                .unwrap(),
+        )
+    }
+    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
+    .await
+}
+
+async fn timeline_collect_keyspace(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    struct Partitioning {
+        keys: crate::keyspace::KeySpace,
+
+        at_lsn: Lsn,
+    }
+
+    impl serde::Serialize for Partitioning {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeMap;
+            let mut map = serializer.serialize_map(Some(2))?;
+            map.serialize_key("keys")?;
+            map.serialize_value(&KeySpace(&self.keys))?;
+            map.serialize_key("at_lsn")?;
+            map.serialize_value(&WithDisplay(&self.at_lsn))?;
+            map.end()
+        }
+    }
+
+    struct WithDisplay<'a, T>(&'a T);
+
+    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            serializer.collect_str(&self.0)
+        }
+    }
+
+    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
+
+    impl<'a> serde::Serialize for KeySpace<'a> {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeSeq;
+            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
+            for kr in &self.0.ranges {
+                seq.serialize_element(&KeyRange(kr))?;
+            }
+            seq.end()
+        }
+    }
+
+    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
+
+    impl<'a> serde::Serialize for KeyRange<'a> {
+        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeTuple;
+            let mut t = serializer.serialize_tuple(2)?;
+            t.serialize_element(&WithDisplay(&self.0.start))?;
+            t.serialize_element(&WithDisplay(&self.0.end))?;
+            t.end()
+        }
+    }
+
+    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
+
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
+        let keys = timeline
+            .collect_keyspace(at_lsn, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
+    }
+    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
+    .await
+}
+
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1583,5 +1722,12 @@ pub fn make_router(
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
+            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
+        })
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
+            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -314,6 +314,7 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum PageCacheErrorKind {
    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
 }

 pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
@@ -690,10 +691,9 @@ impl StorageIoTime {
        .expect("failed to define a metric");
        let metrics = std::array::from_fn(|i| {
            let op = StorageIoOperation::from_repr(i).unwrap();
-            let metric = storage_io_histogram_vec
+            storage_io_histogram_vec
                .get_metric_with_label_values(&[op.as_str()])
-                .unwrap();
-            metric
+                .unwrap()
        });
        Self { metrics }
    }
@@ -966,6 +966,7 @@ pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
    pub(crate) keys_executed: IntCounter,
+    pub(crate) keys_validated: IntCounter,
    pub(crate) dropped_lsn_updates: IntCounter,
    pub(crate) unexpected_errors: IntCounter,
    pub(crate) remote_errors: IntCounterVec,
@@ -987,7 +988,13 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {

    keys_executed: register_int_counter!(
        "pageserver_deletion_queue_executed_total",
-        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
+    )
+    .expect("failed to define a metric"),
+
+    keys_validated: register_int_counter!(
+        "pageserver_deletion_queue_validated_total",
+        "Number of keys validated for deletion.  Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
    )
    .expect("failed to define a metric"),

@@ -1060,26 +1067,6 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_start_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls started",
-            &["task"],
-        )
-        .unwrap()
-    });
-
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_finish_count",
-            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-            &["task"],
-        )
-        .unwrap()
-    });
-
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -83,7 +83,6 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tracing::instrument;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -253,9 +252,6 @@ pub struct PageCache {
    next_evict_slot: AtomicUsize,

    size_metrics: &'static PageCacheSizeMetrics,
-
-    find_victim_waiters:
-        nostarve_queue::Queue<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
 }

 struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
@@ -326,7 +322,7 @@ impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
        match &mut self.state {
            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
-            PageWriteGuardState::Downgraded => todo!(),
+            PageWriteGuardState::Downgraded => unreachable!(),
        }
    }
 }
@@ -443,9 +439,8 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
-    // #[cfg_attr(test, instrument(skip_all, level = "trace", fields(%key, %lsn)))]
    pub async fn memorize_materialized_page(
-        &'static self,
+        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
@@ -536,9 +531,8 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with immutable file pages.

-    // #[cfg_attr(test, instrument(skip_all, level = "trace", fields(?file_id, ?blkno)))]
    pub async fn read_immutable_buf(
-        &'static self,
+        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
@@ -644,7 +638,7 @@ impl PageCache {
    /// ```
    ///
    async fn lock_for_read(
-        &'static self,
+        &self,
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
@@ -866,15 +860,10 @@ impl PageCache {
    ///
    /// On return, the slot is empty and write-locked.
    async fn find_victim(
-        &'static self,
+        &self,
        _permit_witness: &PinnedSlotsPermit,
    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        let nostarve_position = self.find_victim_waiters.begin()
-            .expect("we initialize the nostarve queue to the same size as the slots semaphore, and the caller is presenting a permit");
-
-        // let span = tracing::trace_span!("find_victim", ?nostarve_position);
-        // let _enter = span.enter();
-
+        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
            iters += 1;
@@ -886,8 +875,41 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
-                            unreachable!("find_victim_waiters prevents starvation");
+                        if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot within
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
+                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
                    }
@@ -898,8 +920,7 @@ impl PageCache {
                    inner.key = None;
                }
                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
-
-                return Ok(nostarve_position.complete_and_wait((slot_idx, inner)).await);
+                return Ok((slot_idx, inner));
            }
        }
    }
@@ -943,7 +964,6 @@ impl PageCache {
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-            find_victim_waiters: ::nostarve_queue::Queue::new(num_pages),
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -35,6 +35,7 @@ use std::time::Duration;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
+use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -64,69 +65,6 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
-where
-    IO: AsyncRead + AsyncWrite + Unpin,
-{
-    async_stream::try_stream! {
-        loop {
-            let msg = tokio::select! {
-                biased;
-
-                _ = task_mgr::shutdown_watcher() => {
-                    // We were requested to shut down.
-                    let msg = "pageserver is shutting down";
-                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                    Err(QueryError::Other(anyhow::anyhow!(msg)))
-                }
-
-                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-            };
-
-            match msg {
-                Ok(Some(message)) => {
-                    let copy_data_bytes = match message {
-                        FeMessage::CopyData(bytes) => bytes,
-                        FeMessage::CopyDone => { break },
-                        FeMessage::Sync => continue,
-                        FeMessage::Terminate => {
-                            let msg = "client terminated connection with Terminate message during COPY";
-                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                            // error can't happen here, ErrorResponse serialization should be always ok
-                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                            break;
-                        }
-                        m => {
-                            let msg = format!("unexpected message {m:?}");
-                            // error can't happen here, ErrorResponse serialization should be always ok
-                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                            Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                            break;
-                        }
-                    };
-
-                    yield copy_data_bytes;
-                }
-                Ok(None) => {
-                    let msg = "client closed connection during COPY";
-                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                    // error can't happen here, ErrorResponse serialization should be always ok
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                }
-                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                    Err(io_error)?;
-                }
-                Err(other) => {
-                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                }
-            };
-        }
-    }
-}
-
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -284,7 +222,13 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(
+        conf,
+        broker_client,
+        auth,
+        connection_ctx,
+        task_mgr::shutdown_token(),
+    );
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -318,6 +262,10 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
+
+    /// A token that should fire when the tenant transitions from
+    /// attached state, or when the pageserver is shutting down.
+    cancel: CancellationToken,
 }

 impl PageServerHandler {
@@ -326,6 +274,7 @@ impl PageServerHandler {
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -333,6 +282,91 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            cancel,
+        }
+    }
+
+    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
+    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
+    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
+    /// in the flush.
+    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        tokio::select!(
+            flush_r = pgb.flush() => {
+                Ok(flush_r?)
+            },
+            _ = self.cancel.cancelled() => {
+                Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
+            }
+        )
+    }
+
+    fn copyin_stream<'a, IO>(
+        &'a self,
+        pgb: &'a mut PostgresBackend<IO>,
+    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        async_stream::try_stream! {
+            loop {
+                let msg = tokio::select! {
+                    biased;
+
+                    _ = task_mgr::shutdown_watcher() => {
+                        // We were requested to shut down.
+                        let msg = "pageserver is shutting down";
+                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                        Err(QueryError::Other(anyhow::anyhow!(msg)))
+                    }
+
+                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+                };
+
+                match msg {
+                    Ok(Some(message)) => {
+                        let copy_data_bytes = match message {
+                            FeMessage::CopyData(bytes) => bytes,
+                            FeMessage::CopyDone => { break },
+                            FeMessage::Sync => continue,
+                            FeMessage::Terminate => {
+                                let msg = "client terminated connection with Terminate message during COPY";
+                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                                break;
+                            }
+                            m => {
+                                let msg = format!("unexpected message {m:?}");
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                                break;
+                            }
+                        };
+
+                        yield copy_data_bytes;
+                    }
+                    Ok(None) => {
+                        let msg = "client closed connection during COPY";
+                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                        // error can't happen here, ErrorResponse serialization should be always ok
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                    }
+                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                        Err(io_error)?;
+                    }
+                    Err(other) => {
+                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                    }
+                };
+            }
        }
    }

@@ -372,7 +406,7 @@ impl PageServerHandler {

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -465,7 +499,7 @@ impl PageServerHandler {
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            pgb.flush().await?;
+            self.flush_cancellable(pgb).await?;
        }
        Ok(())
    }
@@ -508,9 +542,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -563,8 +597,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        self.flush_cancellable(pgb).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -772,7 +806,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -824,7 +858,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        let basebackup_after = started
            .elapsed()
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -293,8 +293,6 @@ pub enum TaskKind {

    DebugTool,

-    BackgroundRuntimeTurnaroundMeasure,
-
    #[cfg(test)]
    UnitTest,
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -45,6 +45,7 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::AttachedLocationConfig;
+use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
@@ -406,6 +407,8 @@ pub enum CreateTimelineError {
    AlreadyExists,
    #[error(transparent)]
    AncestorLsn(anyhow::Error),
+    #[error("ancestor timeline is not active")]
+    AncestorNotActive,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -1587,6 +1590,12 @@ impl Tenant {
                    .get_timeline(ancestor_timeline_id, false)
                    .context("Cannot branch off the timeline that's not present in pageserver")?;

+                // instead of waiting around, just deny the request because ancestor is not yet
+                // ready for other purposes either.
+                if !ancestor_timeline.is_active() {
+                    return Err(CreateTimelineError::AncestorNotActive);
+                }
+
                if let Some(lsn) = ancestor_start_lsn.as_mut() {
                    *lsn = lsn.align();

@@ -1619,8 +1628,6 @@ impl Tenant {
            }
        };

-        loaded_timeline.activate(broker_client, None, ctx);
-
        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
            // Ok, the timeline is durable in remote storage.
@@ -1632,6 +1639,8 @@ impl Tenant {
            })?;
        }

+        loaded_timeline.activate(broker_client, None, ctx);
+
        Ok(loaded_timeline)
    }

@@ -2068,6 +2077,15 @@ impl Tenant {
            }
        }
    }
+
+    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
+        self.tenant_conf
+            .read()
+            .unwrap()
+            .location
+            .attach_mode
+            .clone()
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -24,7 +24,7 @@ use crate::control_plane_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{
    create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
@@ -151,61 +151,85 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U

 static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));

-/// Initialize repositories with locally available timelines.
-/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
-/// are scheduled for download and added to the tenant once download is completed.
-#[instrument(skip_all)]
-pub async fn init_tenant_mgr(
+fn emergency_generations(
+    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
+) -> HashMap<TenantId, Generation> {
+    tenant_confs
+        .iter()
+        .filter_map(|(tid, lc)| {
+            let lc = match lc {
+                Ok(lc) => lc,
+                Err(_) => return None,
+            };
+            let gen = match &lc.mode {
+                LocationMode::Attached(alc) => Some(alc.generation),
+                LocationMode::Secondary(_) => None,
+            };
+
+            gen.map(|g| (*tid, g))
+        })
+        .collect()
+}
+
+async fn init_load_generations(
    conf: &'static PageServerConf,
-    resources: TenantSharedResources,
-    init_order: InitializationOrder,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    // Scan local filesystem for attached tenants
-    let tenants_dir = conf.tenants_path();
-
-    let mut tenants = HashMap::new();
-
-    // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-    let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        let result = match client.re_attach().await {
+    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
+    resources: &TenantSharedResources,
+    cancel: &CancellationToken,
+) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
+    let generations = if conf.control_plane_emergency_mode {
+        error!(
+            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
+        );
+        emergency_generations(tenant_confs)
+    } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
+        info!("Calling control plane API to re-attach tenants");
+        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
+        match client.re_attach().await {
            Ok(tenants) => tenants,
            Err(RetryForeverError::ShuttingDown) => {
                anyhow::bail!("Shut down while waiting for control plane re-attach response")
            }
-        };
-
-        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
-        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
-        // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
-        // are processed, even though we don't block on recovery completing here.
-        //
-        // Must only do this if remote storage is enabled, otherwise deletion queue
-        // is not running and channel push will fail.
-        if resources.remote_storage.is_some() {
-            resources
-                .deletion_queue_client
-                .recover(result.clone())
-                .await?;
        }
-
-        Some(result)
    } else {
        info!("Control plane API not configured, tenant generations are disabled");
-        None
+        return Ok(None);
    };

+    // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+    // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
+    // the queue. Sequential processing of the queue ensures that recovery is done before any new tenant deletions
+    // are processed, even though we don't block on recovery completing here.
+    //
+    // Must only do this if remote storage is enabled, otherwise deletion queue
+    // is not running and channel push will fail.
+    if resources.remote_storage.is_some() {
+        resources
+            .deletion_queue_client
+            .recover(generations.clone())?;
+    }
+
+    Ok(Some(generations))
+}
+
+/// Initial stage of load: walk the local tenants directory, clean up any temp files,
+/// and load configurations for the tenants we found.
+async fn init_load_tenant_configs(
+    conf: &'static PageServerConf,
+) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
+    let tenants_dir = conf.tenants_path();
+
    let mut dir_entries = tenants_dir
        .read_dir_utf8()
        .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;

-    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
+    let mut configs = HashMap::new();

    loop {
        match dir_entries.next() {
            None => break,
-            Some(Ok(dir_entry)) => {
-                let tenant_dir_path = dir_entry.path().to_path_buf();
+            Some(Ok(dentry)) => {
+                let tenant_dir_path = dentry.path().to_path_buf();
                if crate::is_temporary(&tenant_dir_path) {
                    info!("Found temporary tenant directory, removing: {tenant_dir_path}");
                    // No need to use safe_remove_tenant_dir_all because this is already
@@ -216,141 +240,158 @@ pub async fn init_tenant_mgr(
                            tenant_dir_path, e
                        );
                    }
-                } else {
-                    // This case happens if we:
-                    // * crash during attach before creating the attach marker file
-                    // * crash during tenant delete before removing tenant directory
-                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
-                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
-                    })?;
-                    if is_empty {
-                        info!("removing empty tenant directory {tenant_dir_path:?}");
-                        if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
-                            error!(
-                                "Failed to remove empty tenant directory '{}': {e:#}",
-                                tenant_dir_path
-                            )
-                        }
-                        continue;
-                    }
-
-                    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-                    if tenant_ignore_mark_file.exists() {
-                        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-                        continue;
-                    }
-
-                    let tenant_id = match tenant_dir_path
-                        .file_name()
-                        .unwrap_or_default()
-                        .parse::<TenantId>()
-                    {
-                        Ok(id) => id,
-                        Err(_) => {
-                            warn!(
-                                "Invalid tenant path (garbage in our repo directory?): {}",
-                                tenant_dir_path
-                            );
-                            continue;
-                        }
-                    };
-
-                    // Try loading the location configuration
-                    let mut location_conf = match Tenant::load_tenant_config(conf, &tenant_id)
-                        .context("load tenant config")
-                    {
-                        Ok(c) => c,
-                        Err(e) => {
-                            warn!("Marking tenant broken, failed to {e:#}");
-
-                            tenants.insert(
-                                tenant_id,
-                                TenantSlot::Attached(Tenant::create_broken_tenant(
-                                    conf,
-                                    tenant_id,
-                                    "error loading tenant location configuration".to_string(),
-                                )),
-                            );
-
-                            continue;
-                        }
-                    };
-
-                    let generation = if let Some(generations) = &tenant_generations {
-                        // We have a generation map: treat it as the authority for whether
-                        // this tenant is really attached.
-                        if let Some(gen) = generations.get(&tenant_id) {
-                            *gen
-                        } else {
-                            match &location_conf.mode {
-                                LocationMode::Secondary(_) => {
-                                    // We do not require the control plane's permission for secondary mode
-                                    // tenants, because they do no remote writes and hence require no
-                                    // generation number
-                                    info!("Loaded tenant {tenant_id} in secondary mode");
-                                    tenants.insert(tenant_id, TenantSlot::Secondary);
-                                }
-                                LocationMode::Attached(_) => {
-                                    // TODO: augment re-attach API to enable the control plane to
-                                    // instruct us about secondary attachments.  That way, instead of throwing
-                                    // away local state, we can gracefully fall back to secondary here, if the control
-                                    // plane tells us so.
-                                    // (https://github.com/neondatabase/neon/issues/5377)
-                                    info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
-                                    if let Err(e) =
-                                        safe_remove_tenant_dir_all(&tenant_dir_path).await
-                                    {
-                                        error!(
-                                            "Failed to remove detached tenant directory '{}': {:?}",
-                                            tenant_dir_path, e
-                                        );
-                                    }
-                                }
-                            };
-
-                            continue;
-                        }
-                    } else {
-                        // Legacy mode: no generation information, any tenant present
-                        // on local disk may activate
-                        info!(
-                            "Starting tenant {} in legacy mode, no generation",
-                            tenant_dir_path
-                        );
-                        Generation::none()
-                    };
-
-                    // Presence of a generation number implies attachment: attach the tenant
-                    // if it wasn't already, and apply the generation number.
-                    location_conf.attach_in_generation(generation);
-                    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
-
-                    match schedule_local_tenant_processing(
-                        conf,
-                        tenant_id,
-                        &tenant_dir_path,
-                        AttachedTenantConf::try_from(location_conf)?,
-                        resources.clone(),
-                        Some(init_order.clone()),
-                        &TENANTS,
-                        &ctx,
-                    ) {
-                        Ok(tenant) => {
-                            tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
-                        }
-                        Err(e) => {
-                            error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
-                        }
-                    }
+                    continue;
                }
+
+                // This case happens if we:
+                // * crash during attach before creating the attach marker file
+                // * crash during tenant delete before removing tenant directory
+                let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
+                    format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
+                })?;
+                if is_empty {
+                    info!("removing empty tenant directory {tenant_dir_path:?}");
+                    if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
+                        error!(
+                            "Failed to remove empty tenant directory '{}': {e:#}",
+                            tenant_dir_path
+                        )
+                    }
+                    continue;
+                }
+
+                let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+                if tenant_ignore_mark_file.exists() {
+                    info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+                    continue;
+                }
+
+                let tenant_id = match tenant_dir_path
+                    .file_name()
+                    .unwrap_or_default()
+                    .parse::<TenantId>()
+                {
+                    Ok(id) => id,
+                    Err(_) => {
+                        warn!(
+                            "Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
+                        );
+                        continue;
+                    }
+                };
+
+                configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
            }
            Some(Err(e)) => {
-                // On error, print it, but continue with the other tenants. If we error out
-                // here, the pageserver startup fails altogether, causing outage for *all*
-                // tenants. That seems worse.
-                error!(
-                    "Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
+                // An error listing the top level directory indicates a serious problem
+                // with local filesystem: we will fail to load, and fail to start.
+                anyhow::bail!(e);
+            }
+        }
+    }
+    Ok(configs)
+}
+
+/// Initialize repositories with locally available timelines.
+/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
+/// are scheduled for download and added to the tenant once download is completed.
+#[instrument(skip_all)]
+pub async fn init_tenant_mgr(
+    conf: &'static PageServerConf,
+    resources: TenantSharedResources,
+    init_order: InitializationOrder,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let mut tenants = HashMap::new();
+
+    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
+
+    // Scan local filesystem for attached tenants
+    let tenant_configs = init_load_tenant_configs(conf).await?;
+
+    // Determine which tenants are to be attached
+    let tenant_generations =
+        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+
+    // Construct `Tenant` objects and start them running
+    for (tenant_id, location_conf) in tenant_configs {
+        let tenant_dir_path = conf.tenant_path(&tenant_id);
+
+        let mut location_conf = match location_conf {
+            Ok(l) => l,
+            Err(e) => {
+                warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
+
+                tenants.insert(
+                    tenant_id,
+                    TenantSlot::Attached(Tenant::create_broken_tenant(
+                        conf,
+                        tenant_id,
+                        format!("{}", e),
+                    )),
                );
+                continue;
+            }
+        };
+
+        let generation = if let Some(generations) = &tenant_generations {
+            // We have a generation map: treat it as the authority for whether
+            // this tenant is really attached.
+            if let Some(gen) = generations.get(&tenant_id) {
+                *gen
+            } else {
+                match &location_conf.mode {
+                    LocationMode::Secondary(_) => {
+                        // We do not require the control plane's permission for secondary mode
+                        // tenants, because they do no remote writes and hence require no
+                        // generation number
+                        info!(%tenant_id, "Loaded tenant in secondary mode");
+                        tenants.insert(tenant_id, TenantSlot::Secondary);
+                    }
+                    LocationMode::Attached(_) => {
+                        // TODO: augment re-attach API to enable the control plane to
+                        // instruct us about secondary attachments.  That way, instead of throwing
+                        // away local state, we can gracefully fall back to secondary here, if the control
+                        // plane tells us so.
+                        // (https://github.com/neondatabase/neon/issues/5377)
+                        info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
+                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                            error!(%tenant_id,
+                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                            );
+                        }
+                    }
+                };
+
+                continue;
+            }
+        } else {
+            // Legacy mode: no generation information, any tenant present
+            // on local disk may activate
+            info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
+            Generation::none()
+        };
+
+        // Presence of a generation number implies attachment: attach the tenant
+        // if it wasn't already, and apply the generation number.
+        location_conf.attach_in_generation(generation);
+        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+
+        match schedule_local_tenant_processing(
+            conf,
+            tenant_id,
+            &tenant_dir_path,
+            AttachedTenantConf::try_from(location_conf)?,
+            resources.clone(),
+            Some(init_order.clone()),
+            &TENANTS,
+            &ctx,
+        ) {
+            Ok(tenant) => {
+                tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
+            }
+            Err(e) => {
+                error!(%tenant_id, "Failed to start tenant: {e:#}");
            }
        }
    }
@@ -653,6 +694,18 @@ pub(crate) async fn upsert_location(

    if let Some(tenant) = shutdown_tenant {
        let (_guard, progress) = utils::completion::channel();
+
+        match tenant.get_attach_mode() {
+            AttachmentMode::Single | AttachmentMode::Multi => {
+                // Before we leave our state as the presumed holder of the latest generation,
+                // flush any outstanding deletions to reduce the risk of leaking objects.
+                deletion_queue_client.flush_advisory()
+            }
+            AttachmentMode::Stale => {
+                // If we're stale there's no point trying to flush deletions
+            }
+        };
+
        info!("Shutting down attached tenant");
        match tenant.shutdown(progress, false).await {
            Ok(()) => {}
@@ -807,8 +860,16 @@ pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    detach_ignored: bool,
+    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<(), TenantStateError> {
-    let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
+    let tmp_path = detach_tenant0(
+        conf,
+        &TENANTS,
+        tenant_id,
+        detach_ignored,
+        deletion_queue_client,
+    )
+    .await?;
    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
    let task_tenant_id = None;
@@ -833,6 +894,7 @@ async fn detach_tenant0(
    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    detach_ignored: bool,
+    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
    let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -844,6 +906,10 @@ async fn detach_tenant0(
    let removal_result =
        remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;

+    // Flush pending deletions, so that they have a good chance of passing validation
+    // before this tenant is potentially re-attached elsewhere.
+    deletion_queue_client.flush_advisory();
+
    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
    if detach_ignored && matches!(removal_result, Err(TenantStateError::NotFound(_))) {
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -901,9 +901,27 @@ impl RemoteTimelineClient {
        .await
        .context("list prefixes")?;

-        let remaining: Vec<RemotePath> = remaining
+        // We will delete the current index_part object last, since it acts as a deletion
+        // marker via its deleted_at attribute
+        let latest_index = remaining
+            .iter()
+            .filter(|p| {
+                p.object_name()
+                    .map(|n| n.starts_with(IndexPart::FILE_NAME))
+                    .unwrap_or(false)
+            })
+            .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen)))
+            .max_by_key(|i| i.1)
+            .map(|i| i.0.clone())
+            .unwrap_or(
+                // No generation-suffixed indices, assume we are dealing with
+                // a legacy index.
+                remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
+            );
+
+        let remaining_layers: Vec<RemotePath> = remaining
            .into_iter()
-            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
+            .filter(|p| p!= &latest_index)
            .inspect(|path| {
                if let Some(name) = path.object_name() {
                    info!(%name, "deleting a file not referenced from index_part.json");
@@ -913,9 +931,11 @@ impl RemoteTimelineClient {
            })
            .collect();

-        let not_referenced_count = remaining.len();
-        if !remaining.is_empty() {
-            self.deletion_queue_client.push_immediate(remaining).await?;
+        let not_referenced_count = remaining_layers.len();
+        if !remaining_layers.is_empty() {
+            self.deletion_queue_client
+                .push_immediate(remaining_layers)
+                .await?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -924,11 +944,9 @@ impl RemoteTimelineClient {
            ))?
        });

-        let index_file_path = timeline_storage_path.join(Utf8Path::new(IndexPart::FILE_NAME));
-
        debug!("enqueuing index part deletion");
        self.deletion_queue_client
-            .push_immediate([index_file_path].to_vec())
+            .push_immediate([latest_index].to_vec())
            .await?;

        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -31,6 +31,7 @@ pub(super) async fn upload_index_part<'a>(
    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });
+    pausable_failpoint!("before-upload-index-pausable");

    let index_part_bytes =
        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -511,8 +511,7 @@ impl DeltaLayer {
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<Self> {
-        let mut summary_buf = Vec::new();
-        summary_buf.resize(PAGE_SZ, 0);
+        let mut summary_buf = vec![0; PAGE_SZ];
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;

@@ -864,11 +863,11 @@ impl DeltaLayerInner {
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
            if actual_summary != expected_summary {
-                // bail!(
-                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                //     actual_summary,
-                //     expected_summary
-                // );
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
            }
        }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -400,8 +400,7 @@ impl ImageLayer {
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
-        let mut summary_buf = Vec::new();
-        summary_buf.resize(PAGE_SZ, 0);
+        let mut summary_buf = vec![0; PAGE_SZ];
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;
        let metadata = file
@@ -457,11 +456,11 @@ impl ImageLayerInner {
            expected_summary.index_root_blk = actual_summary.index_root_blk;

            if actual_summary != expected_summary {
-                // bail!(
-                //     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                //     actual_summary,
-                //     expected_summary
-                // );
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
            }
        }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -14,73 +14,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;

-static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-    once_cell::sync::Lazy::new(|| {
-        let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
-        let permits = usize::max(
-            1,
-            // while a lot of the work is done on spawn_blocking, we still do
-            // repartitioning in the async context. this should give leave us some workers
-            // unblocked to be blocked on other work, hopefully easing any outside visible
-            // effects of restarts.
-            //
-            // 6/8 is a guess; previously we ran with unlimited 8 and more from
-            // spawn_blocking.
-            (total_threads * 3).checked_div(4).unwrap_or(0),
-        );
-        assert_ne!(permits, 0, "we will not be adding in permits later");
-        assert!(
-            permits < total_threads,
-            "need threads avail for shorter work"
-        );
-        tokio::sync::Semaphore::new(permits)
-    });
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
-#[strum(serialize_all = "snake_case")]
-pub(crate) enum BackgroundLoopKind {
-    Compaction,
-    Gc,
-    Eviction,
-    ConsumptionMetricsCollectMetrics,
-    ConsumptionMetricsSyntheticSizeWorker,
-}
-
-impl BackgroundLoopKind {
-    fn as_static_str(&self) -> &'static str {
-        let s: &'static str = self.into();
-        s
-    }
-}
-
-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
-    loop_kind: BackgroundLoopKind,
-    _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
-    crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
-        .with_label_values(&[loop_kind.as_static_str()])
-        .inc();
-    scopeguard::defer!(
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
-    );
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
-    }
-}
-
 /// Start per tenant background loops: compaction and gc.
 pub fn start_background_loops(
    tenant: &Arc<Tenant>,
@@ -183,7 +116,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            warn_when_period_overrun(started_at.elapsed(), period, "compaction");

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -251,7 +184,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            warn_when_period_overrun(started_at.elapsed(), period, "gc");

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -325,11 +258,7 @@ pub(crate) async fn random_init_delay(
 }

 /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
-pub(crate) fn warn_when_period_overrun(
-    elapsed: Duration,
-    period: Duration,
-    task: BackgroundLoopKind,
-) {
+pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
    if elapsed >= period && period != Duration::ZERO {
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
@@ -338,11 +267,11 @@ pub(crate) fn warn_when_period_overrun(
        warn!(
            ?elapsed,
            period = %humantime::format_duration(period),
-            ?task,
+            task,
            "task iteration took longer than the configured period"
        );
        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -44,7 +44,6 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
 };
-use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -685,17 +684,37 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

+        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+            once_cell::sync::Lazy::new(|| {
+                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+                let permits = usize::max(
+                    1,
+                    // while a lot of the work is done on spawn_blocking, we still do
+                    // repartitioning in the async context. this should leave us some workers
+                    // unblocked to be blocked on other work, hopefully easing any outside visible
+                    // effects of restarts.
+                    //
+                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
+                    // spawn_blocking.
+                    (total_threads * 3).checked_div(4).unwrap_or(0),
+                );
+                assert_ne!(permits, 0, "we will not be adding in permits later");
+                assert!(
+                    permits < total_threads,
+                    "need threads avail for shorter work"
+                );
+                tokio::sync::Semaphore::new(permits)
+            });
+
        // this wait probably never needs any "long time spent" logging, because we already nag if
        // compaction task goes over it's period (20s) which is quite often in production.
-        let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Compaction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return Ok(()),
+        let _permit = tokio::select! {
+            permit = CONCURRENT_COMPACTIONS.acquire() => {
+                permit
+            },
+            _ = cancel.cancelled() => {
+                return Ok(());
+            }
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -2344,7 +2363,7 @@ impl Timeline {
                // during branch creation.
                match ancestor.wait_to_become_active(ctx).await {
                    Ok(()) => {}
-                    Err(state) if state == TimelineState::Stopping => {
+                    Err(TimelineState::Stopping) => {
                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
                    }
                    Err(state) => {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,6 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
@@ -130,11 +129,7 @@ impl Timeline {
                    ControlFlow::Continue(()) => (),
                }
                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(
-                    elapsed,
-                    p.period,
-                    BackgroundLoopKind::Eviction,
-                );
+                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
                crate::metrics::EVICTION_ITERATION_DURATION
                    .get_metric_with_label_values(&[
                        &format!("{}", p.period.as_secs()),
@@ -155,17 +150,6 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
-            BackgroundLoopKind::Eviction,
-            ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
-        };
-
        // If we evict layers but keep cached values derived from those layers, then
        // we face a storm of on-demand downloads after pageserver restart.
        // The reason is that the restart empties the caches, and so, the values
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -18,8 +18,7 @@ use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-use tokio::time::Instant;
+use std::sync::{RwLock, RwLockWriteGuard};

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -111,7 +110,7 @@ impl OpenFiles {
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated
    /// recently_used has been set. It's all ready for reuse.
-    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
@@ -143,7 +142,7 @@ impl OpenFiles {
                }
                retries += 1;
            } else {
-                slot_guard = slot.inner.write().await;
+                slot_guard = slot.inner.write().unwrap();
                index = next;
                break;
            }
@@ -154,7 +153,7 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
-            // the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
+            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
            STORAGE_IO_TIME_METRIC
                .get(StorageIoOperation::CloseByReplace)
@@ -209,29 +208,6 @@ impl CrashsafeOverwriteError {
    }
 }

-/// Observe duration for the given storage I/O operation
-///
-/// Unlike `observe_closure_duration`, this supports async,
-/// where "support" means that we measure wall clock time.
-macro_rules! observe_duration {
-    ($op:expr, $($body:tt)*) => {{
-        let instant = Instant::now();
-        let result = $($body)*;
-        let elapsed = instant.elapsed().as_secs_f64();
-        STORAGE_IO_TIME_METRIC
-            .get($op)
-            .observe(elapsed);
-        result
-    }}
-}
-
-macro_rules! with_file {
-    ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
-        let $ident = $this.lock_file().await?;
-        observe_duration!($op, $($body)*)
-    }};
-}
-
 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
@@ -268,9 +244,11 @@ impl VirtualFile {
            tenant_id = "*".to_string();
            timeline_id = "*".to_string();
        }
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot();

-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
        //
@@ -353,24 +331,22 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
+            .await?
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
+            .await?
    }

-    /// Helper function internal to `VirtualFile` that looks up the underlying File,
-    /// opens it and evicts some other File if necessary. The passed parameter is
-    /// assumed to be a function available for the physical `File`.
-    ///
-    /// We are doing it via a macro as Rust doesn't support async closures that
-    /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    /// Helper function that looks up the underlying File for this VirtualFile,
+    /// opening it and evicting some other File if necessary. It calls 'func'
+    /// with the physical File.
+    async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
+    where
+        F: FnMut(&File) -> R,
+    {
        let open_files = get_open_files();

        let mut handle_guard = {
@@ -380,23 +356,27 @@ impl VirtualFile {
            // We only need to hold the handle lock while we read the current handle. If
            // another thread closes the file and recycles the slot for a different file,
            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().await;
+            let mut handle = *self.handle.read().unwrap();
            loop {
                // Check if the slot contains our File
                {
                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().await;
-                    if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
-                        // Found a cached file descriptor.
-                        slot.recently_used.store(true, Ordering::Relaxed);
-                        return Ok(FileGuard { slot_guard });
+                    let slot_guard = slot.inner.read().unwrap();
+                    if slot_guard.tag == handle.tag {
+                        if let Some(file) = &slot_guard.file {
+                            // Found a cached file descriptor.
+                            slot.recently_used.store(true, Ordering::Relaxed);
+                            return Ok(STORAGE_IO_TIME_METRIC
+                                .get(op)
+                                .observe_closure_duration(|| func(file)));
+                        }
                    }
                }

                // The slot didn't contain our File. We will have to open it ourselves,
                // but before that, grab a write lock on handle in the VirtualFile, so
                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().await;
+                let handle_guard = self.handle.write().unwrap();

                // If another thread changed the handle while we were not holding the lock,
                // then the handle might now be valid again. Loop back to retry.
@@ -410,10 +390,17 @@ impl VirtualFile {

        // We need to open the file ourselves. The handle in the VirtualFile is
        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot().await;
+        let (handle, mut slot_guard) = open_files.find_victim_slot();

        // Open the physical file
-        let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
+            .observe_closure_duration(|| self.open_options.open(&self.path))?;
+
+        // Perform the requested operation on it
+        let result = STORAGE_IO_TIME_METRIC
+            .get(op)
+            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -421,9 +408,7 @@ impl VirtualFile {

        *handle_guard = handle;

-        return Ok(FileGuard {
-            slot_guard: slot_guard.downgrade(),
-        });
+        Ok(result)
    }

    pub fn remove(self) {
@@ -438,9 +423,11 @@ impl VirtualFile {
                self.pos = offset;
            }
            SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = self
+                    .with_file(StorageIoOperation::Seek, |mut file| {
+                        file.seek(SeekFrom::End(offset))
+                    })
+                    .await??
            }
            SeekFrom::Current(offset) => {
                let pos = self.pos as i128 + offset as i128;
@@ -528,9 +515,9 @@ impl VirtualFile {
    }

    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -540,9 +527,9 @@ impl VirtualFile {
    }

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = self
+            .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
+            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -552,18 +539,6 @@ impl VirtualFile {
    }
 }

-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
-}
-
-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
-        // This unwrap is safe because we only create `FileGuard`s
-        // if we know that the file is Some.
-        self.slot_guard.file.as_ref().unwrap()
-    }
-}
-
 #[cfg(test)]
 impl VirtualFile {
    pub(crate) async fn read_blk(
@@ -596,39 +571,20 @@ impl VirtualFile {
 impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
-        let handle = self.handle.get_mut();
+        let handle = self.handle.get_mut().unwrap();

-        fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
-            if slot_guard.tag == tag {
-                slot.recently_used.store(false, Ordering::Relaxed);
-                // there is also the `CloseByReplace` operation for closes done on eviction for
-                // comparison.
-                STORAGE_IO_TIME_METRIC
-                    .get(StorageIoOperation::Close)
-                    .observe_closure_duration(|| drop(slot_guard.file.take()));
-            }
-        }
-
-        // We don't have async drop so we cannot directly await the lock here.
-        // Instead, first do a best-effort attempt at closing the underlying
-        // file descriptor by using `try_write`, and if that fails, spawn
-        // a tokio task to do it asynchronously: we just want it to be
-        // cleaned up eventually.
-        // Most of the time, the `try_lock` should succeed though,
-        // as we have `&mut self` access. In other words, if the slot
-        // is still occupied by our file, there should be no access from
-        // other I/O operations; the only other possible place to lock
-        // the slot is the lock algorithm looking for free slots.
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
        let slot = &get_open_files().slots[handle.index];
-        if let Ok(slot_guard) = slot.inner.try_write() {
-            clean_slot(slot, slot_guard, handle.tag);
-        } else {
-            let tag = handle.tag;
-            tokio::spawn(async move {
-                let slot_guard = slot.inner.write().await;
-                clean_slot(slot, slot_guard, tag);
-            });
-        };
+        let mut slot_guard = slot.inner.write().unwrap();
+        if slot_guard.tag == handle.tag {
+            slot.recently_used.store(false, Ordering::Relaxed);
+            // there is also the "close-by-replace" operation, recorded for closes done on
+            // eviction, for comparison.
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::Close)
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
+        }
    }
 }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -825,7 +825,7 @@ impl PostgresRedoManager {
        while nwrite < writebuf.len() {
            let n = loop {
                match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
-                    Err(e) if e == nix::errno::Errno::EINTR => continue,
+                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
            }?;
@@ -917,7 +917,7 @@ impl PostgresRedoManager {
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
                    match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
-                        Err(e) if e == nix::errno::Errno::EINTR => continue,
+                        Err(nix::errno::Errno::EINTR) => continue,
                        res => break res,
                    }
                }?;
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -7,12 +7,12 @@ OBJS = \
 	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
-	libpqwalproposer.o \
 	neon.o \
+	neon_utils.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_utils.o \
+	walproposer_pg.o \
 	control_plane_connector.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -30,7 +30,7 @@

 #include "neon.h"
 #include "walproposer.h"
-#include "walproposer_utils.h"
+#include "neon_utils.h"

 #define PageStoreTrace DEBUG5

--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -1,424 +0,0 @@
-#include "postgres.h"
-
-#include "libpq-fe.h"
-#include "neon.h"
-#include "walproposer.h"
-
-/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
-struct WalProposerConn
-{
-	PGconn	   *pg_conn;
-	bool		is_nonblocking; /* whether the connection is non-blocking */
-	char	   *recvbuf;		/* last received data from
-								 * walprop_async_read */
-};
-
-/* Helper function */
-static bool
-ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
-{
-	/* If we're already correctly blocking or nonblocking, all good */
-	if (is_nonblocking == conn->is_nonblocking)
-		return true;
-
-	/* Otherwise, set it appropriately */
-	if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
-		return false;
-
-	conn->is_nonblocking = is_nonblocking;
-	return true;
-}
-
-/* Exported function definitions */
-char *
-walprop_error_message(WalProposerConn *conn)
-{
-	return PQerrorMessage(conn->pg_conn);
-}
-
-WalProposerConnStatusType
-walprop_status(WalProposerConn *conn)
-{
-	switch (PQstatus(conn->pg_conn))
-	{
-		case CONNECTION_OK:
-			return WP_CONNECTION_OK;
-		case CONNECTION_BAD:
-			return WP_CONNECTION_BAD;
-		default:
-			return WP_CONNECTION_IN_PROGRESS;
-	}
-}
-
-WalProposerConn *
-walprop_connect_start(char *conninfo, char *password)
-{
-	WalProposerConn *conn;
-	PGconn	   *pg_conn;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-
-	/*
-	 * Connect using the given connection string. If the
-	 * NEON_AUTH_TOKEN environment variable was set, use that as
-	 * the password.
-	 *
-	 * The connection options are parsed in the order they're given, so
-	 * when we set the password before the connection string, the
-	 * connection string can override the password from the env variable.
-	 * Seems useful, although we don't currently use that capability
-	 * anywhere.
-	 */
-	n = 0;
-	if (password)
-	{
-		keywords[n] = "password";
-		values[n] = password;
-		n++;
-	}
-	keywords[n] = "dbname";
-	values[n] = conninfo;
-	n++;
-	keywords[n] = NULL;
-	values[n] = NULL;
-	n++;
-	pg_conn = PQconnectStartParams(keywords, values, 1);
-
-	/*
-	 * Allocation of a PQconn can fail, and will return NULL. We want to fully
-	 * replicate the behavior of PQconnectStart here.
-	 */
-	if (!pg_conn)
-		return NULL;
-
-	/*
-	 * And in theory this allocation can fail as well, but it's incredibly
-	 * unlikely if we just successfully allocated a PGconn.
-	 *
-	 * palloc will exit on failure though, so there's not much we could do if
-	 * it *did* fail.
-	 */
-	conn = palloc(sizeof(WalProposerConn));
-	conn->pg_conn = pg_conn;
-	conn->is_nonblocking = false;	/* connections always start in blocking
-									 * mode */
-	conn->recvbuf = NULL;
-	return conn;
-}
-
-WalProposerConnectPollStatusType
-walprop_connect_poll(WalProposerConn *conn)
-{
-	WalProposerConnectPollStatusType return_val;
-
-	switch (PQconnectPoll(conn->pg_conn))
-	{
-		case PGRES_POLLING_FAILED:
-			return_val = WP_CONN_POLLING_FAILED;
-			break;
-		case PGRES_POLLING_READING:
-			return_val = WP_CONN_POLLING_READING;
-			break;
-		case PGRES_POLLING_WRITING:
-			return_val = WP_CONN_POLLING_WRITING;
-			break;
-		case PGRES_POLLING_OK:
-			return_val = WP_CONN_POLLING_OK;
-			break;
-
-			/*
-			 * There's a comment at its source about this constant being
-			 * unused. We'll expect it's never returned.
-			 */
-		case PGRES_POLLING_ACTIVE:
-			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
-
-			/*
-			 * This return is never actually reached, but it's here to make
-			 * the compiler happy
-			 */
-			return WP_CONN_POLLING_FAILED;
-
-		default:
-			Assert(false);
-			return_val = WP_CONN_POLLING_FAILED;	/* keep the compiler quiet */
-	}
-
-	return return_val;
-}
-
-bool
-walprop_send_query(WalProposerConn *conn, char *query)
-{
-	/*
-	 * We need to be in blocking mode for sending the query to run without
-	 * requiring a call to PQflush
-	 */
-	if (!ensure_nonblocking_status(conn, false))
-		return false;
-
-	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(conn->pg_conn, query))
-		return false;
-
-	return true;
-}
-
-WalProposerExecStatusType
-walprop_get_query_result(WalProposerConn *conn)
-{
-	PGresult   *result;
-	WalProposerExecStatusType return_val;
-
-	/* Marker variable if we need to log an unexpected success result */
-	char	   *unexpected_success = NULL;
-
-	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(conn->pg_conn))
-		return WP_EXEC_FAILED;
-
-	if (PQisBusy(conn->pg_conn))
-		return WP_EXEC_NEEDS_INPUT;
-
-
-	result = PQgetResult(conn->pg_conn);
-
-	/*
-	 * PQgetResult returns NULL only if getting the result was successful &
-	 * there's no more of the result to get.
-	 */
-	if (!result)
-	{
-		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
-		return WP_EXEC_UNEXPECTED_SUCCESS;
-	}
-
-	/* Helper macro to reduce boilerplate */
-#define UNEXPECTED_SUCCESS(msg) \
-		return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
-		unexpected_success = msg; \
-		break;
-
-
-	switch (PQresultStatus(result))
-	{
-			/* "true" success case */
-		case PGRES_COPY_BOTH:
-			return_val = WP_EXEC_SUCCESS_COPYBOTH;
-			break;
-
-			/* Unexpected success case */
-		case PGRES_EMPTY_QUERY:
-			UNEXPECTED_SUCCESS("empty query return");
-		case PGRES_COMMAND_OK:
-			UNEXPECTED_SUCCESS("data-less command end");
-		case PGRES_TUPLES_OK:
-			UNEXPECTED_SUCCESS("tuples return");
-		case PGRES_COPY_OUT:
-			UNEXPECTED_SUCCESS("'Copy Out' response");
-		case PGRES_COPY_IN:
-			UNEXPECTED_SUCCESS("'Copy In' response");
-		case PGRES_SINGLE_TUPLE:
-			UNEXPECTED_SUCCESS("single tuple return");
-		case PGRES_PIPELINE_SYNC:
-			UNEXPECTED_SUCCESS("pipeline sync point");
-
-			/* Failure cases */
-		case PGRES_BAD_RESPONSE:
-		case PGRES_NONFATAL_ERROR:
-		case PGRES_FATAL_ERROR:
-		case PGRES_PIPELINE_ABORTED:
-			return_val = WP_EXEC_FAILED;
-			break;
-
-		default:
-			Assert(false);
-			return_val = WP_EXEC_FAILED;	/* keep the compiler quiet */
-	}
-
-	if (unexpected_success)
-		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
-
-	return return_val;
-}
-
-pgsocket
-walprop_socket(WalProposerConn *conn)
-{
-	return PQsocket(conn->pg_conn);
-}
-
-int
-walprop_flush(WalProposerConn *conn)
-{
-	return (PQflush(conn->pg_conn));
-}
-
-void
-walprop_finish(WalProposerConn *conn)
-{
-	if (conn->recvbuf != NULL)
-		PQfreemem(conn->recvbuf);
-	PQfinish(conn->pg_conn);
-	pfree(conn);
-}
-
-/*
- * Receive a message from the safekeeper.
- *
- * On success, the data is placed in *buf. It is valid until the next call
- * to this function.
- */
-PGAsyncReadResult
-walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
-{
-	int			result;
-
-	if (conn->recvbuf != NULL)
-	{
-		PQfreemem(conn->recvbuf);
-		conn->recvbuf = NULL;
-	}
-
-	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(conn->pg_conn))
-	{
-		*amount = 0;
-		*buf = NULL;
-		return PG_ASYNC_READ_FAIL;
-	}
-
-	/*
-	 * The docs for PQgetCopyData list the return values as: 0 if the copy is
-	 * still in progress, but no "complete row" is available -1 if the copy is
-	 * done -2 if an error occurred (> 0) if it was successful; that value is
-	 * the amount transferred.
-	 *
-	 * The protocol we use between walproposer and safekeeper means that we
-	 * *usually* wouldn't expect to see that the copy is done, but this can
-	 * sometimes be triggered by the server returning an ErrorResponse (which
-	 * also happens to have the effect that the copy is done).
-	 */
-	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
-	{
-		case 0:
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_TRY_AGAIN;
-		case -1:
-			{
-				/*
-				 * If we get -1, it's probably because of a server error; the
-				 * safekeeper won't normally send a CopyDone message.
-				 *
-				 * We can check PQgetResult to make sure that the server
-				 * failed; it'll always result in PGRES_FATAL_ERROR
-				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
-
-				if (status != PGRES_FATAL_ERROR)
-					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
-
-				/*
-				 * If there was actually an error, it'll be properly reported
-				 * by calls to PQerrorMessage -- we don't have to do anything
-				 * else
-				 */
-				*amount = 0;
-				*buf = NULL;
-				return PG_ASYNC_READ_FAIL;
-			}
-		case -2:
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_FAIL;
-		default:
-			/* Positive values indicate the size of the returned result */
-			*amount = result;
-			*buf = conn->recvbuf;
-			return PG_ASYNC_READ_SUCCESS;
-	}
-}
-
-PGAsyncWriteResult
-walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
-{
-	int			result;
-
-	/* If we aren't in non-blocking mode, switch to it. */
-	if (!ensure_nonblocking_status(conn, true))
-		return PG_ASYNC_WRITE_FAIL;
-
-	/*
-	 * The docs for PQputcopyData list the return values as: 1 if the data was
-	 * queued, 0 if it was not queued because of full buffers, or -1 if an
-	 * error occurred
-	 */
-	result = PQputCopyData(conn->pg_conn, buf, size);
-
-	/*
-	 * We won't get a result of zero because walproposer always empties the
-	 * connection's buffers before sending more
-	 */
-	Assert(result != 0);
-
-	switch (result)
-	{
-		case 1:
-			/* good -- continue */
-			break;
-		case -1:
-			return PG_ASYNC_WRITE_FAIL;
-		default:
-			elog(FATAL, "invalid return %d from PQputCopyData", result);
-	}
-
-	/*
-	 * After queueing the data, we still need to flush to get it to send. This
-	 * might take multiple tries, but we don't want to wait around until it's
-	 * done.
-	 *
-	 * PQflush has the following returns (directly quoting the docs): 0 if
-	 * sucessful, 1 if it was unable to send all the data in the send queue
-	 * yet -1 if it failed for some reason
-	 */
-	switch (result = PQflush(conn->pg_conn))
-	{
-		case 0:
-			return PG_ASYNC_WRITE_SUCCESS;
-		case 1:
-			return PG_ASYNC_WRITE_TRY_FLUSH;
-		case -1:
-			return PG_ASYNC_WRITE_FAIL;
-		default:
-			elog(FATAL, "invalid return %d from PQflush", result);
-	}
-}
-
-/*
- * This function is very similar to walprop_async_write. For more
- * information, refer to the comments there.
- */
-bool
-walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
-{
-	int			result;
-
-	/* If we are in non-blocking mode, switch out of it. */
-	if (!ensure_nonblocking_status(conn, false))
-		return false;
-
-	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
-		return false;
-
-	Assert(result == 1);
-
-	/* Because the connection is non-blocking, flushing returns 0 or -1 */
-
-	if ((result = PQflush(conn->pg_conn)) == -1)
-		return false;
-
-	Assert(result == 0);
-	return true;
-}
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -18,6 +18,10 @@ extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;

+extern char *wal_acceptors_list;
+extern int	wal_acceptor_reconnect_timeout;
+extern int	wal_acceptor_connection_timeout;
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

@@ -30,4 +34,10 @@ extern void pg_init_extension_server(void);
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

+extern uint64 BackpressureThrottlingTime(void);
+extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+
+extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
+extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
+
 #endif							/* NEON_H */
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -0,0 +1,116 @@
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlogutils.h"
+#include "common/logging.h"
+#include "common/ip.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "replication/walsender_private.h"
+
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+
+#include "libpq-fe.h"
+#include <netinet/tcp.h>
+#include <unistd.h>
+
+#if PG_VERSION_NUM >= 150000
+#include "access/xlogutils.h"
+#include "access/xlogrecovery.h"
+#endif
+#if PG_MAJORVERSION_NUM >= 16
+#include "utils/guc.h"
+#endif
+
+/*
+ * Convert a character which represents a hexadecimal digit to an integer.
+ *
+ * Returns -1 if the character is not a hexadecimal digit.
+ */
+int
+HexDecodeChar(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+
+	return -1;
+}
+
+/*
+ * Decode a hex string into a byte string, 2 hex chars per byte.
+ *
+ * Returns false if invalid characters are encountered; otherwise true.
+ */
+bool
+HexDecodeString(uint8 *result, char *input, int nbytes)
+{
+	int			i;
+
+	for (i = 0; i < nbytes; ++i)
+	{
+		int			n1 = HexDecodeChar(input[i * 2]);
+		int			n2 = HexDecodeChar(input[i * 2 + 1]);
+
+		if (n1 < 0 || n2 < 0)
+			return false;
+		result[i] = n1 * 16 + n2;
+	}
+
+	return true;
+}
+
+/* --------------------------------
+ *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32_le(StringInfo msg)
+{
+	uint32		n32;
+
+	pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
+
+	return n32;
+}
+
+/* --------------------------------
+ *		pq_getmsgint64_le	- get a binary 8-byte int from a message buffer in native (LE) order
+ * --------------------------------
+ */
+uint64
+pq_getmsgint64_le(StringInfo msg)
+{
+	uint64		n64;
+
+	pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
+
+	return n64;
+}
+
+/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint32_le(StringInfo buf, uint32 i)
+{
+	enlargeStringInfo(buf, sizeof(uint32));
+	memcpy(buf->data + buf->len, &i, sizeof(uint32));
+	buf->len += sizeof(uint32);
+}
+
+/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
+void
+pq_sendint64_le(StringInfo buf, uint64 i)
+{
+	enlargeStringInfo(buf, sizeof(uint64));
+	memcpy(buf->data + buf->len, &i, sizeof(uint64));
+	buf->len += sizeof(uint64);
+}
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -0,0 +1,12 @@
+#ifndef __NEON_UTILS_H__
+#define __NEON_UTILS_H__
+
+#include "postgres.h"
+
+bool		HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32		pq_getmsgint32_le(StringInfo msg);
+uint64		pq_getmsgint64_le(StringInfo msg);
+void		pq_sendint32_le(StringInfo buf, uint32 i);
+void		pq_sendint64_le(StringInfo buf, uint64 i);
+
+#endif							/* __NEON_UTILS_H__ */
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -721,7 +721,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls

 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
-	
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

 	if (entry != NULL)
@@ -858,7 +858,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		page_server->flush();
+		if (!page_server->flush())
+		{
+			/* Prefetch set is reset in case of error, so we should try to register our request once again */
+			goto Retry;
+		}
 		MyPState->ring_flush = MyPState->ring_unused;
 	}

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -1,8 +1,8 @@
 #ifndef __NEON_WALPROPOSER_H__
 #define __NEON_WALPROPOSER_H__

-#include "access/xlogdefs.h"
 #include "postgres.h"
+#include "access/xlogdefs.h"
 #include "port.h"
 #include "access/xlog_internal.h"
 #include "access/transam.h"
@@ -16,29 +16,15 @@
 #define MAX_SAFEKEEPERS 32
 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)	/* max size of a single* WAL
 											 * message */
-#define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
-#define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
-								 * message header */
-#define XLOG_HDR_END_POS (1 + 8)	/* offset of end position in wal sender*
-									 * message header */
-
 /*
 * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
 #define WL_NO_EVENTS 0

-extern char *wal_acceptors_list;
-extern int	wal_acceptor_reconnect_timeout;
-extern int	wal_acceptor_connection_timeout;
-extern bool am_wal_proposer;
-
-struct WalProposerConn;			/* Defined in libpqwalproposer */
+struct WalProposerConn;			/* Defined in implementation (walproposer_pg.c) */
 typedef struct WalProposerConn WalProposerConn;

-struct WalMessage;
-typedef struct WalMessage WalMessage;
-
 /* Possible return values from ReadPGAsync */
 typedef enum
 {
@@ -52,7 +38,7 @@ typedef enum
 	PG_ASYNC_READ_TRY_AGAIN,
 	/* Reading failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_READ_FAIL,
-}			PGAsyncReadResult;
+} PGAsyncReadResult;

 /* Possible return values from WritePGAsync */
 typedef enum
@@ -71,7 +57,7 @@ typedef enum
 	PG_ASYNC_WRITE_TRY_FLUSH,
 	/* Writing failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_WRITE_FAIL,
-}			PGAsyncWriteResult;
+} PGAsyncWriteResult;

 /*
 * WAL safekeeper state, which is used to wait for some event.
@@ -147,7 +133,7 @@ typedef enum
 	 * to read.
 	 */
 	SS_ACTIVE,
-}			SafekeeperState;
+} SafekeeperState;

 /* Consensus logical timestamp. */
 typedef uint64 term_t;
@@ -171,12 +157,12 @@ typedef struct ProposerGreeting
 	uint8		tenant_id[16];
 	TimeLineID	timeline;
 	uint32		walSegSize;
-}			ProposerGreeting;
+} ProposerGreeting;

 typedef struct AcceptorProposerMessage
 {
 	uint64		tag;
-}			AcceptorProposerMessage;
+} AcceptorProposerMessage;

 /*
 * Acceptor -> Proposer initial response: the highest term acceptor voted for.
@@ -186,7 +172,7 @@ typedef struct AcceptorGreeting
 	AcceptorProposerMessage apm;
 	term_t		term;
 	NNodeId		nodeId;
-}			AcceptorGreeting;
+} AcceptorGreeting;

 /*
 * Proposer -> Acceptor vote request.
@@ -196,20 +182,20 @@ typedef struct VoteRequest
 	uint64		tag;
 	term_t		term;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-}			VoteRequest;
+} VoteRequest;

 /* Element of term switching chain. */
 typedef struct TermSwitchEntry
 {
 	term_t		term;
 	XLogRecPtr	lsn;
-}			TermSwitchEntry;
+} TermSwitchEntry;

 typedef struct TermHistory
 {
 	uint32		n_entries;
 	TermSwitchEntry *entries;
-}			TermHistory;
+} TermHistory;

 /* Vote itself, sent from safekeeper to proposer */
 typedef struct VoteResponse
@@ -227,7 +213,7 @@ typedef struct VoteResponse
 								 * recovery of some safekeeper */
 	TermHistory termHistory;
 	XLogRecPtr	timelineStartLsn;	/* timeline globally starts at this LSN */
-}			VoteResponse;
+} VoteResponse;

 /*
 * Proposer -> Acceptor message announcing proposer is elected and communicating
@@ -243,7 +229,7 @@ typedef struct ProposerElected
 	TermHistory *termHistory;
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
-}			ProposerElected;
+} ProposerElected;

 /*
 * Header of request with WAL message sent from proposer to safekeeper.
@@ -268,7 +254,7 @@ typedef struct AppendRequestHeader
 	 */
 	XLogRecPtr	truncateLsn;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-}			AppendRequestHeader;
+} AppendRequestHeader;

 /*
 * Hot standby feedback received from replica
@@ -278,7 +264,7 @@ typedef struct HotStandbyFeedback
 	TimestampTz ts;
 	FullTransactionId xmin;
 	FullTransactionId catalog_xmin;
-}			HotStandbyFeedback;
+} HotStandbyFeedback;

 typedef struct PageserverFeedback
 {
@@ -289,7 +275,7 @@ typedef struct PageserverFeedback
 	XLogRecPtr	disk_consistent_lsn;
 	XLogRecPtr	remote_consistent_lsn;
 	TimestampTz replytime;
-}			PageserverFeedback;
+} PageserverFeedback;

 typedef struct WalproposerShmemState
 {
@@ -297,7 +283,7 @@ typedef struct WalproposerShmemState
 	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
-}			WalproposerShmemState;
+} WalproposerShmemState;

 /*
 * Report safekeeper state to proposer
@@ -321,17 +307,22 @@ typedef struct AppendResponse
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
 	PageserverFeedback rf;
-}			AppendResponse;
+} AppendResponse;

 /*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
 #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)

+struct WalProposer;
+typedef struct WalProposer WalProposer;
+
 /*
 * Descriptor of safekeeper
 */
 typedef struct Safekeeper
 {
+	WalProposer *wp;
+
 	char const *host;
 	char const *port;

@@ -340,7 +331,7 @@ typedef struct Safekeeper
 	 *
 	 * May contain private information like password and should not be logged.
 	 */
-	char conninfo[MAXCONNINFO];
+	char		conninfo[MAXCONNINFO];

 	/*
 	 * postgres protocol connection to the WAL acceptor
@@ -373,27 +364,12 @@ typedef struct Safekeeper
 	int			eventPos;		/* position in wait event set. Equal to -1 if*
 								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
-	TimestampTz latestMsgReceivedAt;        /* when latest msg is received */
+	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

-extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
-extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
-extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
-extern void WalProposerPoll(void);
-extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
-											PageserverFeedback *rf);
-extern void StartProposerReplication(StartReplicationCmd *cmd);
-
-extern Size WalproposerShmemSize(void);
-extern bool WalproposerShmemInit(void);
-extern void replication_feedback_set(PageserverFeedback *rf);
-extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
-
-/* libpqwalproposer hooks & helper type */
-
 /* Re-exported PostgresPollingStatusType */
 typedef enum
 {
@@ -406,7 +382,7 @@ typedef enum
 	 * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
 	 * We've removed it here to avoid clutter.
 	 */
-}			WalProposerConnectPollStatusType;
+} WalProposerConnectPollStatusType;

 /* Re-exported and modified ExecStatusType */
 typedef enum
@@ -431,7 +407,7 @@ typedef enum
 	WP_EXEC_NEEDS_INPUT,
 	/* Catch-all failure. Check PQerrorMessage. */
 	WP_EXEC_FAILED,
-}			WalProposerExecStatusType;
+} WalProposerExecStatusType;

 /* Re-exported ConnStatusType */
 typedef enum
@@ -445,67 +421,252 @@ typedef enum
 	 * that extra functionality, so we collect them into a single tag here.
 	 */
 	WP_CONNECTION_IN_PROGRESS,
-}			WalProposerConnStatusType;
-
-/* Re-exported PQerrorMessage */
-extern char *walprop_error_message(WalProposerConn *conn);
-
-/* Re-exported PQstatus */
-extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
-
-/* Re-exported PQconnectStart */
-extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
-
-/* Re-exported PQconectPoll */
-extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
-
-/* Blocking wrapper around PQsendQuery */
-extern bool walprop_send_query(WalProposerConn *conn, char *query);
-
-/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
-extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
-
-/* Re-exported PQsocket */
-extern pgsocket walprop_socket(WalProposerConn *conn);
-
-/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
-extern int	walprop_flush(WalProposerConn *conn);
-
-/* Re-exported PQfinish */
-extern void walprop_finish(WalProposerConn *conn);
+} WalProposerConnStatusType;

 /*
- * Ergonomic wrapper around PGgetCopyData
- *
- * Reads a CopyData block from a safekeeper, setting *amount to the number
- * of bytes returned.
- *
- * This function is allowed to assume certain properties specific to the
- * protocol with the safekeepers, so it should not be used as-is for any
- * other purpose.
- *
- * Note: If possible, using <AsyncRead> is generally preferred, because it
- * performs a bit of extra checking work that's always required and is normally
- * somewhat verbose.
+ * Collection of hooks for walproposer, to call postgres functions,
+ * read WAL and send it over the network.
 */
-extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
+typedef struct walproposer_api
+{
+	/*
+	 * Get WalproposerShmemState. This is used to store information about last
+	 * elected term.
+	 */
+	WalproposerShmemState *(*get_shmem_state) (void);
+
+	/*
+	 * Start receiving notifications about new WAL. This is an infinite loop
+	 * which calls WalProposerBroadcast() and WalProposerPoll() to send the
+	 * WAL.
+	 */
+	void		(*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
+
+	/* Get pointer to the latest available WAL. */
+	XLogRecPtr	(*get_flush_rec_ptr) (void);
+
+	/* Get current time. */
+	TimestampTz (*get_current_timestamp) (void);
+
+	/* Get postgres timeline. */
+	TimeLineID	(*get_timeline_id) (void);
+
+	/* Current error message, aka PQerrorMessage. */
+	char	   *(*conn_error_message) (WalProposerConn *conn);
+
+	/* Connection status, aka PQstatus. */
+	WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
+
+	/* Start the connection, aka PQconnectStart. */
+	WalProposerConn *(*conn_connect_start) (char *conninfo);
+
+	/* Poll an asynchronous connection, aka PQconnectPoll. */
+	WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
+
+	/* Send a blocking SQL query, aka PQsendQuery. */
+	bool		(*conn_send_query) (WalProposerConn *conn, char *query);
+
+	/* Read the query result, aka PQgetResult. */
+	WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
+
+	/* Flush buffer to the network, aka PQflush. */
+	int			(*conn_flush) (WalProposerConn *conn);
+
+	/* Close the connection, aka PQfinish. */
+	void		(*conn_finish) (WalProposerConn *conn);
+
+	/* Try to read CopyData message, aka PQgetCopyData. */
+	PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
+
+	/* Try to write CopyData message, aka PQputCopyData. */
+	PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
+
+	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
+	bool		(*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
+
+	/* Download WAL from startpos to endpos and make it available locally. */
+	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+
+	/* Read WAL from disk to buf. */
+	void		(*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
+
+	/* Allocate WAL reader. */
+	XLogReaderState *(*wal_reader_allocate) (void);
+
+	/* Deallocate event set. */
+	void		(*free_event_set) (void);
+
+	/* Initialize event set. */
+	void		(*init_event_set) (int n_safekeepers);
+
+	/* Update events for an existing safekeeper connection. */
+	void		(*update_event_set) (Safekeeper *sk, uint32 events);
+
+	/* Add a new safekeeper connection to the event set. */
+	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
+
+	/*
+	 * Wait for one of: the timeout being reached, a socket event on a
+	 * safekeeper connection, or new WAL becoming available.
+	 *
+	 * Returns 0 if timeout is reached, 1 if some event happened. Updates
+	 * events mask to indicate events and sets sk to the safekeeper which has
+	 * an event.
+	 */
+	int			(*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
+
+	/* Read random bytes. */
+	bool		(*strong_random) (void *buf, size_t len);
+
+	/*
+	 * Get a basebackup LSN. Used to cross-validate with the latest available
+	 * LSN on the safekeepers.
+	 */
+	XLogRecPtr	(*get_redo_start_lsn) (void);
+
+	/*
+	 * Finish sync safekeepers with the given LSN. This function should not
+	 * return and should exit the program.
+	 */
+	void		(*finish_sync_safekeepers) (XLogRecPtr lsn);
+
+	/*
+	 * Called after every new message from the safekeeper. Used to propagate
+	 * backpressure feedback and to confirm WAL persistence (has been committed
+	 * on the quorum of safekeepers).
+	 */
+	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
+
+	/*
+	 * Called on peer_horizon_lsn updates. Used to advance replication slot
+	 * and to free up disk space by deleting unnecessary WAL.
+	 */
+	void		(*confirm_wal_streamed) (XLogRecPtr lsn);
+} walproposer_api;

 /*
- * Ergonomic wrapper around PQputCopyData + PQflush
- *
- * Starts to write a CopyData block to a safekeeper.
- *
- * For information on the meaning of return codes, refer to PGAsyncWriteResult.
+ * Configuration of the WAL proposer.
 */
-extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
+typedef struct WalProposerConfig
+{
+	/* hex-encoded TenantId cstr */
+	char	   *neon_tenant;
+
+	/* hex-encoded TimelineId cstr */
+	char	   *neon_timeline;
+
+	/*
+	 * Comma-separated list of safekeepers, in the following format:
+	 * host1:port1,host2:port2,host3:port3
+	 *
+	 * This cstr should be editable.
+	 */
+	char	   *safekeepers_list;
+
+	/*
+	 * WalProposer reconnects to offline safekeepers once in this interval.
+	 * Time is in milliseconds.
+	 */
+	int			safekeeper_reconnect_timeout;
+
+	/*
+	 * WalProposer terminates the connection if it doesn't receive any message
+	 * from the safekeeper in this interval. Time is in milliseconds.
+	 */
+	int			safekeeper_connection_timeout;
+
+	/*
+	 * WAL segment size. Will be passed to safekeepers in greet request. Also
+	 * used to detect page headers.
+	 */
+	int			wal_segment_size;
+
+	/*
+	 * If safekeeper was started in sync mode, walproposer will not subscribe
+	 * for new WAL and will exit once a quorum of safekeepers has been synced
+	 * to the latest available LSN.
+	 */
+	bool		syncSafekeepers;
+
+	/* Will be passed to safekeepers in greet request. */
+	uint64		systemId;
+} WalProposerConfig;
+

 /*
- * Blocking equivalent to walprop_async_write_fn
- *
- * Returns 'true' if successful, 'false' on failure.
+ * WAL proposer state.
 */
-extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
+typedef struct WalProposer
+{
+	WalProposerConfig *config;
+	int			n_safekeepers;

-extern uint64 BackpressureThrottlingTime(void);
+	/* (n_safekeepers / 2) + 1 */
+	int			quorum;
+
+	Safekeeper	safekeeper[MAX_SAFEKEEPERS];
+
+	/* WAL has been generated up to this point */
+	XLogRecPtr	availableLsn;
+
+	/* last commitLsn broadcasted to safekeepers */
+	XLogRecPtr	lastSentCommitLsn;
+
+	ProposerGreeting greetRequest;
+
+	/* Vote request for safekeeper */
+	VoteRequest voteRequest;
+
+	/*
+	 * Minimal LSN which may be needed for recovery of some safekeeper,
+	 * record-aligned (first record which someone might not have received yet).
+	 */
+	XLogRecPtr	truncateLsn;
+
+	/*
+	 * Term of the proposer. We want our term to be highest and unique, so we
+	 * collect terms from safekeepers quorum, choose max and +1. After that
+	 * our term is fixed and must not change. If we observe that some
+	 * safekeeper has higher term, it means that we have another running
+	 * compute, so we must stop immediately.
+	 */
+	term_t		propTerm;
+
+	/* term history of the proposer */
+	TermHistory propTermHistory;
+
+	/* epoch start lsn of the proposer */
+	XLogRecPtr	propEpochStartLsn;
+
+	/* Most advanced acceptor epoch */
+	term_t		donorEpoch;
+
+	/* Most advanced acceptor */
+	int			donor;
+
+	/* timeline globally starts at this LSN */
+	XLogRecPtr	timelineStartLsn;
+
+	/* number of votes collected from safekeepers */
+	int			n_votes;
+
+	/* number of successful connections over the lifetime of walproposer */
+	int			n_connected;
+
+	/*
+	 * Timestamp of the last reconnection attempt. Related to
+	 * config->safekeeper_reconnect_timeout
+	 */
+	TimestampTz last_reconnect_attempt;
+
+	walproposer_api api;
+} WalProposer;
+
+extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api);
+extern void WalProposerStart(WalProposer *wp);
+extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
+extern void WalProposerPoll(WalProposer *wp);
+extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
+										   PageserverFeedback *rf);

 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -1,659 +0,0 @@
-#include "postgres.h"
-
-#include "access/timeline.h"
-#include "access/xlogutils.h"
-#include "common/logging.h"
-#include "common/ip.h"
-#include "funcapi.h"
-#include "libpq/libpq.h"
-#include "libpq/pqformat.h"
-#include "miscadmin.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
-#include "walproposer_utils.h"
-#include "replication/walsender_private.h"
-
-#include "storage/ipc.h"
-#include "utils/builtins.h"
-#include "utils/ps_status.h"
-
-#include "libpq-fe.h"
-#include <netinet/tcp.h>
-#include <unistd.h>
-
-#if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
-#include "access/xlogrecovery.h"
-#endif
-#if PG_MAJORVERSION_NUM >= 16
-#include "utils/guc.h"
-#endif
-
-/*
- * These variables are used similarly to openLogFile/SegNo,
- * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
- * corresponding the filename of walpropFile.
- */
-static int	walpropFile = -1;
-static TimeLineID walpropFileTLI = 0;
-static XLogSegNo walpropSegNo = 0;
-
-/* START cloned file-local variables and functions from walsender.c */
-
-/*
- * How far have we sent WAL already? This is also advertised in
- * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
- */
-static XLogRecPtr sentPtr = InvalidXLogRecPtr;
-
-static void WalSndLoop(void);
-static void XLogBroadcastWalProposer(void);
-/* END cloned file-level variables and functions from walsender.c */
-
-int
-CompareLsn(const void *a, const void *b)
-{
-	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
-	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
-
-	if (lsn1 < lsn2)
-		return -1;
-	else if (lsn1 == lsn2)
-		return 0;
-	else
-		return 1;
-}
-
-/* Returns a human-readable string corresonding to the SafekeeperState
- *
- * The string should not be freed.
- *
- * The strings are intended to be used as a prefix to "state", e.g.:
- *
- *   elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
- *
- * If this sort of phrasing doesn't fit the message, instead use something like:
- *
- *   elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
- */
-char *
-FormatSafekeeperState(SafekeeperState state)
-{
-	char	   *return_val = NULL;
-
-	switch (state)
-	{
-		case SS_OFFLINE:
-			return_val = "offline";
-			break;
-		case SS_CONNECTING_READ:
-		case SS_CONNECTING_WRITE:
-			return_val = "connecting";
-			break;
-		case SS_WAIT_EXEC_RESULT:
-			return_val = "receiving query result";
-			break;
-		case SS_HANDSHAKE_RECV:
-			return_val = "handshake (receiving)";
-			break;
-		case SS_VOTING:
-			return_val = "voting";
-			break;
-		case SS_WAIT_VERDICT:
-			return_val = "wait-for-verdict";
-			break;
-		case SS_SEND_ELECTED_FLUSH:
-			return_val = "send-announcement-flush";
-			break;
-		case SS_IDLE:
-			return_val = "idle";
-			break;
-		case SS_ACTIVE:
-			return_val = "active";
-			break;
-	}
-
-	Assert(return_val != NULL);
-
-	return return_val;
-}
-
-/* Asserts that the provided events are expected for given safekeeper's state */
-void
-AssertEventsOkForState(uint32 events, Safekeeper *sk)
-{
-	uint32		expected = SafekeeperStateDesiredEvents(sk->state);
-
-	/*
-	 * The events are in-line with what we're expecting, under two conditions:
-	 * (a) if we aren't expecting anything, `events` has no read- or
-	 * write-ready component. (b) if we are expecting something, there's
-	 * overlap (i.e. `events & expected != 0`)
-	 */
-	bool		events_ok_for_state;	/* long name so the `Assert` is more
-										 * clear later */
-
-	if (expected == WL_NO_EVENTS)
-		events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
-	else
-		events_ok_for_state = ((events & expected) != 0);
-
-	if (!events_ok_for_state)
-	{
-		/*
-		 * To give a descriptive message in the case of failure, we use elog
-		 * and then an assertion that's guaranteed to fail.
-		 */
-		elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-			 FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
-		Assert(events_ok_for_state);
-	}
-}
-
-/* Returns the set of events a safekeeper in this state should be waiting on
- *
- * This will return WL_NO_EVENTS (= 0) for some events. */
-uint32
-SafekeeperStateDesiredEvents(SafekeeperState state)
-{
-	uint32		result = WL_NO_EVENTS;
-
-	/* If the state doesn't have a modifier, we can check the base state */
-	switch (state)
-	{
-			/* Connecting states say what they want in the name */
-		case SS_CONNECTING_READ:
-			result = WL_SOCKET_READABLE;
-			break;
-		case SS_CONNECTING_WRITE:
-			result = WL_SOCKET_WRITEABLE;
-			break;
-
-			/* Reading states need the socket to be read-ready to continue */
-		case SS_WAIT_EXEC_RESULT:
-		case SS_HANDSHAKE_RECV:
-		case SS_WAIT_VERDICT:
-			result = WL_SOCKET_READABLE;
-			break;
-
-			/*
-			 * Idle states use read-readiness as a sign that the connection
-			 * has been disconnected.
-			 */
-		case SS_VOTING:
-		case SS_IDLE:
-			result = WL_SOCKET_READABLE;
-			break;
-
-			/*
-			 * Flush states require write-ready for flushing. Active state
-			 * does both reading and writing.
-			 *
-			 * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
-			 * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
-			 */
-		case SS_SEND_ELECTED_FLUSH:
-		case SS_ACTIVE:
-			result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
-			break;
-
-			/* The offline state expects no events. */
-		case SS_OFFLINE:
-			result = WL_NO_EVENTS;
-			break;
-
-		default:
-			Assert(false);
-			break;
-	}
-
-	return result;
-}
-
-/* Returns a human-readable string corresponding to the event set
- *
- * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the
- * returned string may be meaingless.
- *
- * The string should not be freed. It should also not be expected to remain the same between
- * function calls. */
-char *
-FormatEvents(uint32 events)
-{
-	static char return_str[8];
-
-	/* Helper variable to check if there's extra bits */
-	uint32		all_flags = WL_LATCH_SET
-	| WL_SOCKET_READABLE
-	| WL_SOCKET_WRITEABLE
-	| WL_TIMEOUT
-	| WL_POSTMASTER_DEATH
-	| WL_EXIT_ON_PM_DEATH
-	| WL_SOCKET_CONNECTED;
-
-	/*
-	 * The formatting here isn't supposed to be *particularly* useful -- it's
-	 * just to give an sense of what events have been triggered without
-	 * needing to remember your powers of two.
-	 */
-
-	return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_';
-	return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_';
-	return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
-	return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_';
-	return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
-	return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
-	return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
-
-	if (events & (~all_flags))
-	{
-		elog(WARNING, "Event formatting found unexpected component %d",
-			 events & (~all_flags));
-		return_str[6] = '*';
-		return_str[7] = '\0';
-	}
-	else
-		return_str[6] = '\0';
-
-	return (char *) &return_str;
-}
-
-/*
- * Convert a character which represents a hexadecimal digit to an integer.
- *
- * Returns -1 if the character is not a hexadecimal digit.
- */
-static int
-HexDecodeChar(char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 10;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 10;
-
-	return -1;
-}
-
-/*
- * Decode a hex string into a byte string, 2 hex chars per byte.
- *
- * Returns false if invalid characters are encountered; otherwise true.
- */
-bool
-HexDecodeString(uint8 *result, char *input, int nbytes)
-{
-	int			i;
-
-	for (i = 0; i < nbytes; ++i)
-	{
-		int			n1 = HexDecodeChar(input[i * 2]);
-		int			n2 = HexDecodeChar(input[i * 2 + 1]);
-
-		if (n1 < 0 || n2 < 0)
-			return false;
-		result[i] = n1 * 16 + n2;
-	}
-
-	return true;
-}
-
-/* --------------------------------
- *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
- * --------------------------------
- */
-uint32
-pq_getmsgint32_le(StringInfo msg)
-{
-	uint32		n32;
-
-	pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
-
-	return n32;
-}
-
-/* --------------------------------
- *		pq_getmsgint64	- get a binary 8-byte int from a message buffer in native (LE) order
- * --------------------------------
- */
-uint64
-pq_getmsgint64_le(StringInfo msg)
-{
-	uint64		n64;
-
-	pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
-
-	return n64;
-}
-
-/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
-void
-pq_sendint32_le(StringInfo buf, uint32 i)
-{
-	enlargeStringInfo(buf, sizeof(uint32));
-	memcpy(buf->data + buf->len, &i, sizeof(uint32));
-	buf->len += sizeof(uint32);
-}
-
-/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
-void
-pq_sendint64_le(StringInfo buf, uint64 i)
-{
-	enlargeStringInfo(buf, sizeof(uint64));
-	memcpy(buf->data + buf->len, &i, sizeof(uint64));
-	buf->len += sizeof(uint64);
-}
-
-/*
- * Write XLOG data to disk.
- */
-void
-XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
-{
-	int			startoff;
-	int			byteswritten;
-
-	while (nbytes > 0)
-	{
-		int			segbytes;
-
-		/* Close the current segment if it's completed */
-		if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
-			XLogWalPropClose(recptr);
-
-		if (walpropFile < 0)
-		{
-#if PG_VERSION_NUM >= 150000
-			/* FIXME Is it ok to use hardcoded value here? */
-			TimeLineID	tli = 1;
-#else
-			bool		use_existent = true;
-#endif
-			/* Create/use new log file */
-			XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
-#if PG_VERSION_NUM >= 150000
-			walpropFile = XLogFileInit(walpropSegNo, tli);
-			walpropFileTLI = tli;
-#else
-			walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
-			walpropFileTLI = ThisTimeLineID;
-#endif
-		}
-
-		/* Calculate the start offset of the received logs */
-		startoff = XLogSegmentOffset(recptr, wal_segment_size);
-
-		if (startoff + nbytes > wal_segment_size)
-			segbytes = wal_segment_size - startoff;
-		else
-			segbytes = nbytes;
-
-		/* OK to write the logs */
-		errno = 0;
-
-		byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
-		if (byteswritten <= 0)
-		{
-			char		xlogfname[MAXFNAMELEN];
-			int			save_errno;
-
-			/* if write didn't set errno, assume no disk space */
-			if (errno == 0)
-				errno = ENOSPC;
-
-			save_errno = errno;
-			XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
-			errno = save_errno;
-			ereport(PANIC,
-					(errcode_for_file_access(),
-					 errmsg("could not write to log segment %s "
-							"at offset %u, length %lu: %m",
-							xlogfname, startoff, (unsigned long) segbytes)));
-		}
-
-		/* Update state for write */
-		recptr += byteswritten;
-
-		nbytes -= byteswritten;
-		buf += byteswritten;
-	}
-
-	/*
-	 * Close the current segment if it's fully written up in the last cycle of
-	 * the loop.
-	 */
-	if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
-	{
-		XLogWalPropClose(recptr);
-	}
-}
-
-/*
- * Close the current segment.
- */
-void
-XLogWalPropClose(XLogRecPtr recptr)
-{
-	Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
-
-	if (close(walpropFile) != 0)
-	{
-		char		xlogfname[MAXFNAMELEN];
-
-		XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
-
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg("could not close log segment %s: %m",
-						xlogfname)));
-	}
-
-	walpropFile = -1;
-}
-
-/* START of cloned functions from walsender.c */
-
-/*
- * Subscribe for new WAL and stream it in the loop to safekeepers.
- *
- * At the moment, this never returns, but an ereport(ERROR) will take us back
- * to the main loop.
- */
-void
-StartProposerReplication(StartReplicationCmd *cmd)
-{
-	XLogRecPtr	FlushPtr;
-	TimeLineID	currTLI;
-
-#if PG_VERSION_NUM < 150000
-	if (ThisTimeLineID == 0)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
-#endif
-
-	/*
-	 * We assume here that we're logging enough information in the WAL for
-	 * log-shipping, since this is checked in PostmasterMain().
-	 *
-	 * NOTE: wal_level can only change at shutdown, so in most cases it is
-	 * difficult for there to be WAL data that we can still see that was
-	 * written at wal_level='minimal'.
-	 */
-
-	if (cmd->slotname)
-	{
-		ReplicationSlotAcquire(cmd->slotname, true);
-		if (SlotIsLogical(MyReplicationSlot))
-			ereport(ERROR,
-					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("cannot use a logical replication slot for physical replication")));
-
-		/*
-		 * We don't need to verify the slot's restart_lsn here; instead we
-		 * rely on the caller requesting the starting point to use.  If the
-		 * WAL segment doesn't exist, we'll fail later.
-		 */
-	}
-
-	/*
-	 * Select the timeline. If it was given explicitly by the client, use
-	 * that. Otherwise use the timeline of the last replayed record, which is
-	 * kept in ThisTimeLineID.
-	 *
-	 * Neon doesn't currently use PG Timelines, but it may in the future, so
-	 * we keep this code around to lighten the load for when we need it.
-	 */
-#if PG_VERSION_NUM >= 150000
-	FlushPtr = GetFlushRecPtr(&currTLI);
-#else
-	FlushPtr = GetFlushRecPtr();
-	currTLI = ThisTimeLineID;
-#endif
-
-	/*
-	 * When we first start replication the standby will be behind the
-	 * primary. For some applications, for example synchronous
-	 * replication, it is important to have a clear state for this initial
-	 * catchup mode, so we can trigger actions when we change streaming
-	 * state later. We may stay in this state for a long time, which is
-	 * exactly why we want to be able to monitor whether or not we are
-	 * still here.
-	 */
-	WalSndSetState(WALSNDSTATE_CATCHUP);
-
-	/*
-	 * Don't allow a request to stream from a future point in WAL that
-	 * hasn't been flushed to disk in this server yet.
-	 */
-	if (FlushPtr < cmd->startpoint)
-	{
-		ereport(ERROR,
-				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
-						LSN_FORMAT_ARGS(cmd->startpoint),
-						LSN_FORMAT_ARGS(FlushPtr))));
-	}
-
-	/* Start streaming from the requested point */
-	sentPtr = cmd->startpoint;
-
-	/* Initialize shared memory status, too */
-	SpinLockAcquire(&MyWalSnd->mutex);
-	MyWalSnd->sentPtr = sentPtr;
-	SpinLockRelease(&MyWalSnd->mutex);
-
-	SyncRepInitConfig();
-
-	/* Infinite send loop, never returns */
-	WalSndLoop();
-
-	WalSndSetState(WALSNDSTATE_STARTUP);
-
-	if (cmd->slotname)
-		ReplicationSlotRelease();
-}
-
-/*
- * Main loop that waits for LSN updates and calls the walproposer.
- * Synchronous replication sets latch in WalSndWakeup at walsender.c
- */
-static void
-WalSndLoop(void)
-{
-	/* Clear any already-pending wakeups */
-	ResetLatch(MyLatch);
-
-	for (;;)
-	{
-		CHECK_FOR_INTERRUPTS();
-
-		XLogBroadcastWalProposer();
-
-		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-			WalSndSetState(WALSNDSTATE_STREAMING);
-		WalProposerPoll();
-	}
-}
-
-/*
- * Notify walproposer about the new WAL position.
- */
-static void
-XLogBroadcastWalProposer(void)
-{
-	XLogRecPtr	startptr;
-	XLogRecPtr	endptr;
-
-	/* Start from the last sent position */
-	startptr = sentPtr;
-
-	/*
-	 * Streaming the current timeline on a primary.
-	 *
-	 * Attempt to send all data that's already been written out and
-	 * fsync'd to disk.  We cannot go further than what's been written out
-	 * given the current implementation of WALRead().  And in any case
-	 * it's unsafe to send WAL that is not securely down to disk on the
-	 * primary: if the primary subsequently crashes and restarts, standbys
-	 * must not have applied any WAL that got lost on the primary.
-	 */
-#if PG_VERSION_NUM >= 150000
-	endptr = GetFlushRecPtr(NULL);
-#else
-	endptr = GetFlushRecPtr();
-#endif
-
-	/*
-	 * Record the current system time as an approximation of the time at which
-	 * this WAL location was written for the purposes of lag tracking.
-	 *
-	 * In theory we could make XLogFlush() record a time in shmem whenever WAL
-	 * is flushed and we could get that time as well as the LSN when we call
-	 * GetFlushRecPtr() above (and likewise for the cascading standby
-	 * equivalent), but rather than putting any new code into the hot WAL path
-	 * it seems good enough to capture the time here.  We should reach this
-	 * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
-	 * may take some time, we read the WAL flush pointer and take the time
-	 * very close to together here so that we'll get a later position if it is
-	 * still moving.
-	 *
-	 * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
-	 * this gives us a cheap approximation for the WAL flush time for this
-	 * LSN.
-	 *
-	 * Note that the LSN is not necessarily the LSN for the data contained in
-	 * the present message; it's the end of the WAL, which might be further
-	 * ahead.  All the lag tracking machinery cares about is finding out when
-	 * that arbitrary LSN is eventually reported as written, flushed and
-	 * applied, so that it can measure the elapsed time.
-	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
-
-	/* Do we have any work to do? */
-	Assert(startptr <= endptr);
-	if (endptr <= startptr)
-		return;
-
-	WalProposerBroadcast(startptr, endptr);
-	sentPtr = endptr;
-
-	/* Update shared memory status */
-	{
-		WalSnd	   *walsnd = MyWalSnd;
-
-		SpinLockAcquire(&walsnd->mutex);
-		walsnd->sentPtr = sentPtr;
-		SpinLockRelease(&walsnd->mutex);
-	}
-
-	/* Report progress of XLOG streaming in PS display */
-	if (update_process_title)
-	{
-		char		activitymsg[50];
-
-		snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
-				 LSN_FORMAT_ARGS(sentPtr));
-		set_ps_display(activitymsg);
-	}
-}
--- a/pgxn/neon/walproposer_utils.h
+++ b/pgxn/neon/walproposer_utils.h
@@ -1,19 +0,0 @@
-#ifndef __NEON_WALPROPOSER_UTILS_H__
-#define __NEON_WALPROPOSER_UTILS_H__
-
-#include "walproposer.h"
-
-int			CompareLsn(const void *a, const void *b);
-char	   *FormatSafekeeperState(SafekeeperState state);
-void		AssertEventsOkForState(uint32 events, Safekeeper *sk);
-uint32		SafekeeperStateDesiredEvents(SafekeeperState state);
-char	   *FormatEvents(uint32 events);
-bool		HexDecodeString(uint8 *result, char *input, int nbytes);
-uint32		pq_getmsgint32_le(StringInfo msg);
-uint64		pq_getmsgint64_le(StringInfo msg);
-void		pq_sendint32_le(StringInfo buf, uint32 i);
-void		pq_sendint64_le(StringInfo buf, uint64 i);
-void		XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
-void		XLogWalPropClose(XLogRecPtr recptr);
-
-#endif							/* __NEON_WALPROPOSER_UTILS_H__ */
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -89,7 +89,10 @@ pub mod errors {
                Self::Console {
                    status: http::StatusCode::LOCKED,
                    ref text,
-                } => !text.contains("quota"),
+                } => {
+                    !text.contains("written data quota exceeded")
+                        && !text.contains("the limit for current plan reached")
+                }
                // retry server errors
                Self::Console { status, .. } if status.is_server_error() => true,
                _ => false,
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -47,6 +47,7 @@ enum Payload {

 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
+const HTTP_CONNECTION_TIMEOUT: tokio::time::Duration = tokio::time::Duration::from_secs(15);

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -189,27 +190,44 @@ pub async fn handle(
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
 ) -> Result<Response<Body>, ApiError> {
-    let result = handle_inner(request, sni_hostname, conn_pool, session_id).await;
-
+    let result = tokio::time::timeout(
+        HTTP_CONNECTION_TIMEOUT,
+        handle_inner(request, sni_hostname, conn_pool, session_id),
+    )
+    .await;
    let mut response = match result {
-        Ok(r) => r,
-        Err(e) => {
-            let message = format!("{:?}", e);
-            let code = match e.downcast_ref::<tokio_postgres::Error>() {
-                Some(e) => match e.code() {
-                    Some(e) => serde_json::to_value(e.code()).unwrap(),
+        Ok(r) => match r {
+            Ok(r) => r,
+            Err(e) => {
+                let message = format!("{:?}", e);
+                let code = e.downcast_ref::<tokio_postgres::Error>().and_then(|e| {
+                    e.code()
+                        .map(|s| serde_json::to_value(s.code()).unwrap_or_default())
+                });
+                let code = match code {
+                    Some(c) => c,
                    None => Value::Null,
-                },
-                None => Value::Null,
-            };
-            error!(
-                ?code,
-                "sql-over-http per-client task finished with an error: {e:#}"
+                };
+                error!(
+                    ?code,
+                    "sql-over-http per-client task finished with an error: {e:#}"
+                );
+                // TODO: this shouldn't always be bad request.
+                json_response(
+                    StatusCode::BAD_REQUEST,
+                    json!({ "message": message, "code": code }),
+                )?
+            }
+        },
+        Err(_) => {
+            let message = format!(
+                "HTTP-Connection timed out, execution time exeeded {} seconds",
+                HTTP_CONNECTION_TIMEOUT.as_secs()
            );
-            // TODO: this shouldn't always be bad request.
+            error!(message);
            json_response(
-                StatusCode::BAD_REQUEST,
-                json!({ "message": message, "code": code }),
+                StatusCode::GATEWAY_TIMEOUT,
+                json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }),
            )?
        }
    };
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -7,6 +7,7 @@ use crate::{
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+    http::StatusCode,
    metrics::{Ids, USAGE_METRICS},
    protocol2::WithClientIp,
    stream::{PqStream, Stream},
@@ -75,6 +76,15 @@ static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

+static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_breakdown",
+        "Number of wake-up failures (per kind).",
+        &["retry", "kind"],
+    )
+    .unwrap()
+});
+
 static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_io_bytes_per_client",
@@ -397,6 +407,46 @@ impl ConnectMechanism for TcpMechanism<'_> {
    }
 }

+const fn bool_to_str(x: bool) -> &'static str {
+    if x {
+        "true"
+    } else {
+        "false"
+    }
+}
+
+fn report_error(e: &WakeComputeError, retry: bool) {
+    use crate::console::errors::ApiError;
+    let retry = bool_to_str(retry);
+    let kind = match e {
+        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
+        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ref text,
+        }) if text.contains("written data quota exceeded")
+            || text.contains("the limit for current plan reached") =>
+        {
+            "quota_exceeded"
+        }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ..
+        }) => "api_console_locked",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        }) => "api_console_bad_request",
+        WakeComputeError::ApiError(ApiError::Console { status, .. })
+            if status.is_server_error() =>
+        {
+            "api_console_other_server_error"
+        }
+        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
+    };
+    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
+}
+
 /// Try to connect to the compute node, retrying if necessary.
 /// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
@@ -440,10 +490,12 @@ where
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                report_error(&e, false);
                return Err(e.into());
            }
            // failed to wake up but we can continue to retry
            Ok(ControlFlow::Continue(e)) => {
+                report_error(&e, true);
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
            }
            // successfully woke up a compute node and can break the wakeup loop
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -723,9 +723,9 @@ impl Timeline {
            if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                return Ok(()); // nothing to do
            }
-            let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1);
+
            // release the lock before removing
-            remover
+            shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
        };

        // delete old WAL files
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1085,15 +1085,32 @@ class AbstractNeonCli(abc.ABC):
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
+
+        indent = "  "
        if not res.returncode:
-            log.info(f"Run {res.args} success: {res.stdout}")
+            stripped = res.stdout.strip()
+            lines = stripped.splitlines()
+            if len(lines) < 2:
+                log.debug(f"Run {res.args} success: {stripped}")
+            else:
+                log.debug("Run %s success:\n%s" % (res.args, textwrap.indent(stripped, indent)))
        elif check_return_code:
            # this way command output will be in recorded and shown in CI in failure message
-            msg = f"""\
-            Run {res.args} failed:
-              stdout: {res.stdout}
-              stderr: {res.stderr}
+            indent = indent * 2
+            msg = textwrap.dedent(
+                """\
+            Run %s failed:
+              stdout:
+            %s
+              stderr:
+            %s
            """
+            )
+            msg = msg % (
+                res.args,
+                textwrap.indent(res.stdout.strip(), indent),
+                textwrap.indent(res.stderr.strip(), indent),
+            )
            log.info(msg)
            raise RuntimeError(msg) from subprocess.CalledProcessError(
                res.returncode, res.args, res.stdout, res.stderr
@@ -1447,6 +1464,29 @@ class NeonCli(AbstractNeonCli):

        return self.raw_cli(args, check_return_code=check_return_code)

+    def map_branch(
+        self, name: str, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> "subprocess.CompletedProcess[str]":
+        """
+        Map tenant id and timeline id to a neon_local branch name. They do not have to exist.
+        Usually needed when creating branches via PageserverHttpClient and not neon_local.
+
+        After creating a name mapping, you can use EndpointFactory.create_start
+        with this registered branch name.
+        """
+        args = [
+            "mappings",
+            "map",
+            "--branch-name",
+            name,
+            "--tenant-id",
+            str(tenant_id),
+            "--timeline-id",
+            str(timeline_id),
+        ]
+
+        return self.raw_cli(args, check_return_code=True)
+
    def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
        return self.raw_cli(["start"], check_return_code=check_return_code)

--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -74,11 +74,14 @@ def wait_until_tenant_state(
    for _ in range(iterations):
        try:
            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
+        except Exception as e:
+            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
+        else:
            log.debug(f"Tenant {tenant_id} data: {tenant}")
            if tenant["state"]["slug"] == expected_state:
                return tenant
-        except Exception as e:
-            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
+            if tenant["state"]["slug"] == "Broken":
+                raise RuntimeError(f"tenant became Broken, not {expected_state}")

        time.sleep(period)

--- a/test_runner/performance/test_pageserver_startup_many_tenants.py
+++ b/test_runner/performance/test_pageserver_startup_many_tenants.py
@@ -1,52 +0,0 @@
-import queue
-import threading
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.types import TenantId
-
-"""
-553  sudo mkfs.ext4 /dev/nvme1n1
-555  mkdir test_output
-556  sudo mount /dev/nvme1n1 test_output
-557  htop
-559  ./scripts/pysync
-560  NEON_BIN=/home/admin/neon/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
-561  sudo chown -R admin:admin test_output
-
-cargo build_testing --release
-
-562  NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
-
-cd test_output/test_pageserver_startup_many_tenants/repo
-
-sudo env  NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000  ../../../target/release/neon_local start
-# watch initial load complete, then background jobs start. That's the interesting part.
-sudo env  NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000  ../../../target/release/neon_local stop
-# usually pageserver won't be responsive, kill with
-sudo pkill -9 pageserver
-"""
-def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-
-    #  below doesn't work because summaries contain tenant and timeline ids and we check for them
-
-    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
-    pshttp = env.pageserver.http_client()
-    ep = env.endpoints.create_start("main")
-    ep.safe_psql("create table foo(b text)")
-    for i in range(0, 8):
-        ep.safe_psql("insert into foo(b) values ('some text')")
-        # pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
-        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
-        pshttp.timeline_checkpoint(tenant_id, timeline_id)
-    ep.stop_and_destroy()
-
-    env.pageserver.stop()
-    for sk in env.safekeepers:
-        sk.stop()
-
-    tenant_dir = env.repo_dir / "pageserver_1" / "tenants" / str(env.initial_tenant)
-
-    for i in range(0, 20_000):
-        import shutil
-
-        shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -1,14 +1,24 @@
 import random
 import threading
 import time
-from typing import List
+from queue import SimpleQueue
+from typing import Any, Dict, List, Union

 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import Endpoint, NeonEnv, PgBin
-from fixtures.types import Lsn
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+)
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import wait_until_tenant_active
+from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
+from requests import RequestException
+from requests.exceptions import RetryError


 # Test branch creation
@@ -128,3 +138,245 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi
    endpoint1 = env.endpoints.create_start("b1")

    pg_bin.run_capture(["pgbench", "-i", endpoint1.connstr()])
+
+
+def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonEnvBuilder):
+    """
+    Endpoint should not be possible to create because branch has not been uploaded.
+    """
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    env.pageserver.allowed_errors.append(
+        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading"
+    )
+    ps_http = env.pageserver.http_client()
+
+    # pause all uploads
+    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
+    ps_http.tenant_create(env.initial_tenant)
+
+    initial_branch = "initial_branch"
+
+    def start_creating_timeline():
+        with pytest.raises(RequestException):
+            ps_http.timeline_create(
+                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+    t = threading.Thread(target=start_creating_timeline)
+    try:
+        t.start()
+
+        wait_until_paused(env, "before-upload-index-pausable")
+
+        env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline)
+
+        with pytest.raises(RuntimeError, match="is not active, state: Loading"):
+            env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
+    finally:
+        # FIXME: paused uploads bother shutdown
+        env.pageserver.stop(immediate=True)
+
+        t.join()
+
+
+def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder):
+    """
+    Branch should not be possible to create because ancestor has not been uploaded.
+    """
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    env.pageserver.allowed_errors.append(
+        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
+    )
+    ps_http = env.pageserver.http_client()
+
+    # pause all uploads
+    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
+    ps_http.tenant_create(env.initial_tenant)
+
+    def start_creating_timeline():
+        with pytest.raises(RequestException):
+            ps_http.timeline_create(
+                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+    t = threading.Thread(target=start_creating_timeline)
+    try:
+        t.start()
+
+        wait_until_paused(env, "before-upload-index-pausable")
+
+        branch_id = TimelineId.generate()
+
+        with pytest.raises(RetryError, match="too many 503 error responses"):
+            ps_http.timeline_create(
+                env.pg_version,
+                env.initial_tenant,
+                branch_id,
+                ancestor_timeline_id=env.initial_timeline,
+            )
+
+        with pytest.raises(
+            PageserverApiException,
+            match=f"NotFound: Timeline {env.initial_tenant}/{branch_id} was not found",
+        ):
+            ps_http.timeline_detail(env.initial_tenant, branch_id)
+            # important to note that a task might still be in progress to complete
+            # the work, but will never get to that because we have the pause
+            # failpoint
+    finally:
+        # FIXME: paused uploads bother shutdown
+        env.pageserver.stop(immediate=True)
+
+        t.join()
+
+
+def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder):
+    """
+    If the activate-only-after-upload approach is used, then retries could become competing.
+    """
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    env.pageserver.allowed_errors.append(
+        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory"
+    )
+    ps_http = env.pageserver.http_client()
+
+    # pause all uploads
+    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
+    ps_http.tenant_create(env.initial_tenant)
+
+    def start_creating_timeline():
+        ps_http.timeline_create(
+            env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+        )
+
+    create_root = threading.Thread(target=start_creating_timeline)
+
+    branch_id = TimelineId.generate()
+
+    queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue()
+    barrier = threading.Barrier(3)
+
+    def try_branch():
+        barrier.wait()
+        barrier.wait()
+        try:
+            ret = ps_http.timeline_create(
+                env.pg_version,
+                env.initial_tenant,
+                branch_id,
+                ancestor_timeline_id=env.initial_timeline,
+                timeout=5,
+            )
+            queue.put(ret)
+        except Exception as e:
+            queue.put(e)
+
+    threads = [threading.Thread(target=try_branch) for _ in range(2)]
+
+    try:
+        create_root.start()
+
+        for t in threads:
+            t.start()
+
+        wait_until_paused(env, "before-upload-index-pausable")
+
+        barrier.wait()
+        ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
+        barrier.wait()
+
+        # now both requests race to branch, only one can win because they take gc_cs, Tenant::timelines or marker files
+        first = queue.get()
+        second = queue.get()
+
+        log.info(first)
+        log.info(second)
+
+        (succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first)
+        assert isinstance(failed, Exception)
+        assert isinstance(succeeded, Dict)
+
+        # FIXME: there's probably multiple valid status codes:
+        # - Timeline 62505b9a9f6b1d29117b1b74eaf07b12/56cd19d3b2dbcc65e9d53ec6ca304f24 already exists
+        # - whatever 409 response says, but that is a subclass of PageserverApiException
+        assert isinstance(failed, PageserverApiException)
+        assert succeeded["state"] == "Active"
+    finally:
+        # we might still have the failpoint active
+        env.pageserver.stop(immediate=True)
+
+        # pytest should nag if we leave threads unjoined
+        for t in threads:
+            t.join()
+        create_root.join()
+
+
+def test_non_uploaded_branch_availability_after_restart(neon_env_builder: NeonEnvBuilder):
+    """
+    Currently before RFC#27 we keep and continue uploading branches which were not successfully uploaded before shutdown.
+
+    This test likely duplicates some other test, but it's easier to write one than to make sure there will be a failing test when the rfc is implemented.
+    """
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    env.pageserver.allowed_errors.append(
+        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
+    )
+    ps_http = env.pageserver.http_client()
+
+    # pause all uploads
+    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
+    ps_http.tenant_create(env.initial_tenant)
+
+    def start_creating_timeline():
+        with pytest.raises(RequestException):
+            ps_http.timeline_create(
+                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
+            )
+
+    t = threading.Thread(target=start_creating_timeline)
+    try:
+        t.start()
+
+        wait_until_paused(env, "before-upload-index-pausable")
+    finally:
+        # FIXME: paused uploads bother shutdown
+        env.pageserver.stop(immediate=True)
+        t.join()
+
+    # now without a failpoint
+    env.pageserver.start()
+
+    wait_until_tenant_active(ps_http, env.initial_tenant)
+
+    # currently it lives on and will get eventually uploaded, but this will change
+    detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+    assert detail["state"] == "Active"
+
+
+def wait_until_paused(env: NeonEnv, failpoint: str):
+    found = False
+    msg = f"at failpoint {failpoint}"
+    for _ in range(20):
+        time.sleep(1)
+        found = env.pageserver.log_contains(msg) is not None
+        if found:
+            break
+    assert found
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -3,7 +3,10 @@ import time
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.pageserver.utils import (
+    wait_for_upload_queue_empty,
+    wait_until_tenant_active,
+)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from requests.exceptions import ConnectionError

@@ -113,6 +116,8 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
    time.sleep(1)

    env.pageserver.start()
+    wait_until_tenant_active(pageserver_http, tenant_id)
+
    message = f".*duplicated L1 layer layer={l1_found.name}"
    env.pageserver.allowed_errors.append(message)

--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -10,6 +10,7 @@ of the pageserver are:
 """


+import enum
 import re
 import time
 from typing import Optional
@@ -81,7 +82,7 @@ def generate_uploads_and_deletions(
                    f"""
                INSERT INTO foo (id, val)
                SELECT g, '{data}'
-                FROM generate_series(1, 20000) g
+                FROM generate_series(1, 200) g
                ON CONFLICT (id) DO UPDATE
                SET val = EXCLUDED.val
                """,
@@ -116,6 +117,10 @@ def get_deletion_queue_submitted(ps_http) -> int:
    return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")


+def get_deletion_queue_validated(ps_http) -> int:
+    return get_metric_or_0(ps_http, "pageserver_deletion_queue_validated_total")
+
+
 def get_deletion_queue_dropped(ps_http) -> int:
    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")

@@ -272,13 +277,29 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
    assert get_deletion_queue_unexpected_errors(ps_http) == 0


-@pytest.mark.parametrize("keep_attachment", [True, False])
+class KeepAttachment(str, enum.Enum):
+    KEEP = "keep"
+    LOSE = "lose"
+
+
+class ValidateBefore(str, enum.Enum):
+    VALIDATE = "validate"
+    NO_VALIDATE = "no-validate"
+
+
+@pytest.mark.parametrize("keep_attachment", [KeepAttachment.KEEP, KeepAttachment.LOSE])
+@pytest.mark.parametrize("validate_before", [ValidateBefore.VALIDATE, ValidateBefore.NO_VALIDATE])
 def test_deletion_queue_recovery(
-    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    keep_attachment: KeepAttachment,
+    validate_before: ValidateBefore,
 ):
    """
-    :param keep_attachment: If true, we re-attach after restart.  Else, we act as if some other
+    :param keep_attachment: whether to re-attach after restart.  Else, we act as if some other
    node took the attachment while we were restarting.
+    :param validate_before: whether to wait for deletions to be validated before restart.  This
+    makes them eligible to be executed after restart, if the same node keeps the attachment.
    """
    neon_env_builder.enable_generations = True
    neon_env_builder.enable_pageserver_remote_storage(
@@ -288,12 +309,20 @@ def test_deletion_queue_recovery(

    ps_http = env.pageserver.http_client()

-    # Prevent deletion lists from being executed, to build up some backlog of deletions
-    ps_http.configure_failpoints(
-        [
-            ("deletion-queue-before-execute", "return"),
-        ]
-    )
+    failpoints = [
+        # Prevent deletion lists from being executed, to build up some backlog of deletions
+        ("deletion-queue-before-execute", "return"),
+    ]
+
+    if validate_before == ValidateBefore.NO_VALIDATE:
+        failpoints.append(
+            # Prevent deletion lists from being validated, we will test that they are
+            # dropped properly during recovery.  'pause' is okay here because we kill
+            # the pageserver with immediate=true
+            ("control-plane-client-validate", "pause")
+        )
+
+    ps_http.configure_failpoints(failpoints)

    generate_uploads_and_deletions(env)

@@ -305,10 +334,25 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

+    if validate_before == ValidateBefore.VALIDATE:
+
+        def assert_validation_complete():
+            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
+
+        wait_until(20, 1, assert_validation_complete)
+
+        # The validated keys statistic advances before the header is written, so we
+        # also wait to see the header hit the disk: this seems paranoid but the race
+        # can really happen on a heavily overloaded test machine.
+        def assert_header_written():
+            assert (env.pageserver.workdir / "deletion" / "header-01").exists()
+
+        wait_until(20, 1, assert_header_written)
+
    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

-    if not keep_attachment:
+    if keep_attachment == KeepAttachment.LOSE:
        some_other_pageserver = 101010
        assert env.attachment_service is not None
        env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver)
@@ -327,14 +371,17 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment:
-        # If we kept the attachment, then our pre-restart deletions should have executed
-        # successfully
+    if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
+        # - If we kept the attachment, then our pre-restart deletions should execute
+        #   because on re-attach they were from the immediately preceding generation
+        # - If we validated before restart, then the deletions should execute because the
+        #   deletion queue header records a validated deletion list sequence number.
        assert get_deletion_queue_executed(ps_http) == before_restart_depth
    else:
+        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
+
        # If we lost the attachment, we should have dropped our pre-restart deletions.
        assert get_deletion_queue_dropped(ps_http) == before_restart_depth
-        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])

    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
@@ -350,3 +397,73 @@ def test_deletion_queue_recovery(

    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+
+def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    ps_http = env.pageserver.http_client()
+
+    generate_uploads_and_deletions(env)
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # When the pageserver can't reach the control plane, it will complain
+            ".*calling control plane generation validation API failed.*",
+            # Emergency mode is a big deal, we log errors whenever it is used.
+            ".*Emergency mode!.*",
+        ]
+    )
+
+    # Simulate a major incident: the control plane goes offline
+    assert env.attachment_service is not None
+    env.attachment_service.stop()
+
+    # Remember how many validations had happened before the control plane went offline
+    validated = get_deletion_queue_validated(ps_http)
+
+    generate_uploads_and_deletions(env, init=False)
+
+    # The running pageserver should stop progressing deletions
+    time.sleep(10)
+    assert get_deletion_queue_validated(ps_http) == validated
+
+    # Restart the pageserver: ordinarily we would _avoid_ doing this during such an
+    # incident, but it might be unavoidable: if so, we want to be able to start up
+    # and serve clients.
+    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
+    env.pageserver.start(
+        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
+    )
+
+    # The pageserver should provide service to clients
+    generate_uploads_and_deletions(env, init=False)
+
+    # The pageserver should neither validate nor execute any deletions, it should have
+    # loaded the DeletionLists from before though
+    time.sleep(10)
+    assert get_deletion_queue_depth(ps_http) > 0
+    assert get_deletion_queue_validated(ps_http) == 0
+    assert get_deletion_queue_executed(ps_http) == 0
+
+    # When the control plane comes back up, normal service should resume
+    env.attachment_service.start()
+
+    ps_http.deletion_queue_flush(execute=True)
+    assert get_deletion_queue_depth(ps_http) == 0
+    assert get_deletion_queue_validated(ps_http) > 0
+    assert get_deletion_queue_executed(ps_http) > 0
+
+    # The pageserver should work fine when subsequently restarted in non-emergency mode
+    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
+    env.pageserver.start()
+
+    generate_uploads_and_deletions(env, init=False)
+    ps_http.deletion_queue_flush(execute=True)
+    assert get_deletion_queue_depth(ps_http) == 0
+    assert get_deletion_queue_validated(ps_http) > 0
+    assert get_deletion_queue_executed(ps_http) > 0
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -17,6 +17,8 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
    n_restarts = 10
    scale = 10

+    env.pageserver.allowed_errors.append(".*query handler.*failed.*Shutting down")
+
    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -45,14 +45,11 @@ def test_tenant_delete_smoke(
        [
            # The deletion queue will complain when it encounters simulated S3 errors
            ".*deletion executor: DeleteObjects request failed.*",
+            # lucky race with stopping from flushing a layer we fail to schedule any uploads
+            ".*layer flush task.+: could not flush frozen layer: update_metadata_file",
        ]
    )

-    # lucky race with stopping from flushing a layer we fail to schedule any uploads
-    env.pageserver.allowed_errors.append(
-        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
-    )
-
    ps_http = env.pageserver.http_client()

    # first try to delete non existing tenant
@@ -194,11 +191,9 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
    )

    if simulate_failures:
-        env.pageserver.allowed_errors.extend(
-            [
-                # The deletion queue will complain when it encounters simulated S3 errors
-                ".*deletion executor: DeleteObjects request failed.*",
-            ]
+        env.pageserver.allowed_errors.append(
+            # The deletion queue will complain when it encounters simulated S3 errors
+            ".*deletion executor: DeleteObjects request failed.*",
        )

    ps_http = env.pageserver.http_client()
@@ -293,6 +288,10 @@ def test_tenant_delete_is_resumed_on_attach(
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env.pageserver.allowed_errors.append(
+        # lucky race with stopping from flushing a layer we fail to schedule any uploads
+        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
+    )

    tenant_id = env.initial_tenant

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -752,6 +752,9 @@ def test_ignore_while_attaching(
    env.pageserver.allowed_errors.append(
        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
    )
+    # An endpoint is starting up concurrently with our detach, it can
+    # experience RPC failure due to shutdown.
+    env.pageserver.allowed_errors.append(".*query handler.*failed.*Shutting down")

    data_id = 1
    data_secret = "very secret secret"
Author	SHA1	Message	Date
John Spray	7f3670a589	pageserver: don't log deletion S3 op failures as errors	2023-10-11 14:26:01 +01:00
Alex Chi Z	5158de70f3	proxy: breakdown wake up failure metrics (#4933 ) ## Problem close https://github.com/neondatabase/neon/issues/4702 ## Summary of changes This PR adds a new metric for wake up errors and breaks it down by most common reasons (mostly follows the `could_retry` implementation).	2023-10-10 13:17:37 +01:00
khanova	aec9188d36	Added timeout for http requests (#5514 ) # Problem Proxy timeout for HTTP-requests ## Summary of changes If the HTTP-request exceeds 15s, it would be killed. Resolves: https://github.com/neondatabase/neon/issues/4847	2023-10-10 13:39:38 +02:00
John Spray	acefee9a32	pageserver: flush deletion queue on detach (#5452 ) ## Problem If a caller detaches a tenant and then attaches it again, pending deletions from the old attachment might not have happened yet. This is not a correctness problem, but it causes: - Risk of leaking some objects in S3 - Some warnings from the deletion queue when pending LSN updates and pending deletions don't pass validation. ## Summary of changes - Deletion queue now uses UnboundedChannel so that the push interfaces don't have to be async. - This was pulled out of https://github.com/neondatabase/neon/pull/5397, where it is also useful to be able to drive the queue from non-async contexts. - Why is it okay for this to be unbounded? The only way the unbounded-ness of the channel can become a problem is if writing out deletion lists can't keep up, but if the system were that overloaded then the code generating deletions (GC, compaction) would also be impacted. - DeletionQueueClient gets a new `flush_advisory` function, which is like flush_execute, but doesn't wait for completion: this is appropriate for use in contexts where we would like to encourage the deletion queue to flush, but don't need to block on it. - This function is also expected to be useful in next steps for seamless migration, where the option to flush to S3 while transitioning into AttachedStale will also include flushing deletion queue, but we wouldn't want to block on that flush. - The tenant_detach code in mgr.rs invokes flush_advisory after stopping the `Tenant` object. --------- Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>	2023-10-10 10:46:24 +01:00
Conrad Ludgate	bf065aabdf	proxy: update locked error retry filter (#5376 ) ## Problem We don't want to retry customer quota exhaustion errors. ## Summary of changes Make sure both types of quota exhaustion errors are not retried	2023-10-10 08:59:16 +01:00
Konstantin Knizhnik	fe74fac276	Fix handling flush error in prefetch (#5473 ) ## Problem See https://neondb.slack.com/archives/C05U648A9NJ In case of failure of flush in prefetch, prefetch state is reset. We need to retry register buffer attempt, otherwise we will get assertion failure. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2023-10-10 07:43:37 +03:00
Alexander Bayandin	b91ac670e1	Update plpgsql_check extension to 2.5.3 (#5437 )	2023-10-09 17:07:43 +01:00
John Spray	b3195afd20	tests: fix a race in test_deletion_queue_recovery on loaded nodes (#5495 ) ## Problem Seen in CI for https://github.com/neondatabase/neon/pull/5453 -- the time gap between validation completing and the header getting written is long enough to fail the test, where it was doing a cheeky 1 second sleep. ## Summary of changes - Replace 1 second sleep with a wait_until to see the header file get written - Use enums as test params to make the results more readable (instead of True-False parameters) - Fix the temp suffix used for deletion queue headers: this worked fine, but resulted in `..tmp` extension.	2023-10-09 16:28:28 +01:00
John Spray	7eaa7a496b	pageserver: cancellation handling in writes to postgres client socket (#5503 ) ## Problem Writes to the postgres client socket from the page server were not wrapped in cancellation handling, so a stuck client connection could prevent tenant shutdown. ## Summary of changes All the places we call flush() to write to the socket, we should be respecting the cancellation token for the task. In this PR, I explicitly pass around a CancellationToken rather than doing inline `task_mgr::shutdown_token` calls, to avoid coupling it to the global task_mgr state and make it easier to refactor later. I have some follow-on commits that add a Shutdown variant to QueryError and use it more extensively, but that's pure refactor so will keep separate from this bug fix PR. Closes: https://github.com/neondatabase/neon/issues/5341	2023-10-09 15:54:17 +01:00
Joonas Koivunen	4772cd6c93	fix: deny branching, starting compute from not yet uploaded timelines (#5484 ) Part of #5172. First commits show that we used to allow starting up a compute or creating a branch off a not yet uploaded timeline. This PR moves activation of a timeline to happen after initial layer file(s) (if any) and `index_part.json` have been uploaded. Simply moving activation to be after downloads have finished works because we now spawn a task per http request handler. Current behaviour of uploading on the timelines on next startup is kept, to be removed later as part of #5172. Adds: - `NeonCli.map_branch` and corresponding `neon_local` implementation: allow creating computes for timelines managed via pageserver http client/api - possibly duplicate tests (I did not want to search for, will cleanup in a follow-up if these duplicated) Changes: - make `wait_until_tenant_state` return immediately on `Broken` and not wait more	2023-10-09 17:03:38 +03:00
Shany Pozin	010b4d0d5c	Move ApiError 404 to info level (#5501 ) ## Problem Moving ApiError 404 to info level logging (see https://github.com/neondatabase/neon/pull/5489#issuecomment-1750211212)	2023-10-09 13:54:46 +03:00
Rahul Modpur	477cb3717b	Fix neon_local pageserver status command (#5475 ) ## Problem Fix neon_local pageserver status command #5430 ## Summary of changes Fix clap config for pageserver status subcommand ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. Signed-off-by: Rahul Modpur <rmodpur2@gmail.com>	2023-10-09 09:13:57 +01:00
John Spray	ea5a97e7b4	pageserver: implement emergency mode for operating without control plane (#5469 ) ## Problem Pageservers with `control_plane_api` configured require a control plane to start up: in an incident this might be a problem. ## Summary of changes Note to reviewers: most of the code churn in mgr.rs is the refactor commit that enables the later emergency mode commit: you may want to review commits separately. - Add `control_plane_emergency_mode` configuration property - Refactor init_tenant_mgr to separate loading configurations from the main loop where we construct Tenant, so that the generations fetch can peek at the configs in emergency mode. - During startup, in emergency mode, attach any tenants that were attached on their last run, using the same generation number. Closes: #5381 Closes: https://github.com/neondatabase/neon/issues/5492	2023-10-06 17:25:21 +01:00
John Spray	547914fe19	pageserver: adjust timeline deletion for generations (#5453 ) ## Problem Spun off from https://github.com/neondatabase/neon/pull/5449 Timeline deletion does the following: 1. Delete layers referenced in the index 2. Delete everything else in the timeline prefix, except the index 3. Delete the index. When generations were added, the filter in step 2 got outdated, such that the index objects were deleted along with everything else at step 2. That didn't really break anything, but it makes an automated test unhappy and is a violation of the original intent of the code, which presumably intends to uphold an invariant that as long as any objects for a timeline exist, the index exists. (Eventually, this index-object-last complexity can go away: when we do https://github.com/neondatabase/neon/issues/5080, there is no need to keep the index_part around, as deletions can always be retried any time any where.) ## Summary of changes After object listing, split the listed objects into layers and index objects. Delete the layers first, then the index objects.	2023-10-06 16:15:18 +00:00
Arpad Müller	607b185a49	Fix 1.73.0 clippy lints (#5494 ) Doesn't do an upgrade of rustc to 1.73.0 as we want to wait for the cargo response of the curl CVE before updating. In preparation for an update, we address the clippy lints that are newly firing in 1.73.0.	2023-10-06 14:17:19 +01:00
Christian Schwarz	bfba5e3aca	page_cache: ensure forward progress on miss (#5482 ) Problem ======= Prior to this PR, when we had a cache miss, we'd get back a write guard, fill it, then drop it and retry the read from cache. If there's severe contention for the cache, it could happen that the just-filled data gets evicted before our retry, resulting in lost work and no forward progress. Solution ======== This PR leverages the now-available `tokio::sync::RwLockWriteGuard`'s `downgrade()` functionality to turn the filled slot write guard into a read guard. We don't drop the guard at any point, so, forward progress is ensured. Refs ==== Stacked atop https://github.com/neondatabase/neon/pull/5480 part of https://github.com/neondatabase/neon/issues/4743 specifically part of https://github.com/neondatabase/neon/issues/5479	2023-10-06 13:41:13 +01:00
Christian Schwarz	ecc7a9567b	page_cache: inline `{,try_}lock_for_write` into `memorize_materialized_page` (#5480 ) Motivation ========== It's the only user, and the name of `_for_write` is wrong as of commit `7a63685cde` Author: Christian Schwarz <christian@neon.tech> Date: Fri Aug 18 19:31:03 2023 +0200 simplify page-caching of EphemeralFile (#4994) Notes ===== This also allows us to get rid of the WriteBufResult type. Also rename `search_mapping_for_write` to `search_mapping_exact`. It makes more sense that way because there is no `_for_write`-locking anymore. Refs ==== part of https://github.com/neondatabase/neon/issues/4743 specifically https://github.com/neondatabase/neon/issues/5479 this is prep work for https://github.com/neondatabase/neon/pull/5482	2023-10-06 13:38:02 +02:00
Joonas Koivunen	45f98dd018	debug_tool: get page at lsn and keyspace via http api (#5057 ) If there are any layermap or layer file related problems, having a reproducible `get_page@lsn` easily usable for fast debugging iteration is helpful. Split off from #4938. Later evolved to add http apis for: - `get_page@lsn` at `/v1/tenant/:tenant_id/timeline/:timeline_id/get?key=<hex>&lsn=<lsn string>` - collecting the keyspace at `/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace?[at_lsn=<lsn string>]` - defaults to `last_record_lsn` collecting the keyspace seems to yield some ranges for which there is no key.	2023-10-06 12:17:38 +01:00
John Spray	bdfe27f3ac	swagger: add a 503 definition to each endpoint (#5476 ) ## Problem The control plane doesn't have generic handling for this. ## Summary of changes Add a 503 response to every endpoint.	2023-10-06 11:31:49 +01:00
Joonas Koivunen	a15f9b3baa	pageserver: Tune 503 Resource unavailable (#5489 ) 503 Resource Unavailable appears as error in logs, but is not really an error which should ever fail a test on, or even log an error in prod, [evidence]. Changes: - log 503 as `info!` level - use `Cow<'static, str>` instead of `String` - add an additional `wait_until_tenant_active` in `test_actually_duplicate_l1` We ought to have in tests "wait for tenants to complete loading" but this is easier to implement for now. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5485/6423110295/index.html#/testresult/182de66203864fc0	2023-10-06 09:59:14 +01:00
Alexander Bayandin	ce92638185	test_runner: allow race in test_tenant_delete_is_resumed_on_attach (#5478 ) ## Problem `test_tenant_delete_is_resumed_on_attach` is flaky ## Summary of changes - Allow race in `test_tenant_delete_is_resumed_on_attach` - Cleanup `allowed_errors` in the file a bit	2023-10-06 09:49:31 +01:00
Joonas Koivunen	a3c82f19b8	tests: prettier subprocess output in test log (#5485 ) Clean subprocess output so that: - one line of output is just one line without a linebreak - like shells handle `echo subshell says: $(echo foo)` - multiple lines are indented like other pytest output - error output is dedented and then indented to be like other pytest output Minor readability changes remove friction.	2023-10-05 20:15:55 +00:00
Arthur Petukhovsky	8b15252f98	Move walproposer state into struct (#5364 ) This patch extracts all postgres-dependent functions in a separate `walproposer_api` functions struct. It helps to compile walproposer as static library without compiling all other postgres server code. This is useful to allow calling walproposer C code from Rust, or linking this library with anything else. All global variables containing walproposer state were extracted to a separate `WalProposer` struct. This makes it possible to run several walproposers in the same process, in separate threads. There were no logic changes and PR mostly consists of shuffling functions between several files. We have a good test coverage for walproposer code and I've seen no issues with tests while I was refactoring it, so I don't expect any issues after merge. ref https://github.com/neondatabase/neon/issues/547 --------- Co-authored-by: Arseny Sher <sher-ars@yandex.ru>	2023-10-05 18:48:01 +01:00
Alexander Bayandin	522aaca718	Temporary deploy staging preprod region from main (#5477 ) ## Problem Staging preprod region can't use `release-XXX` right now, the config is unified across all regions, it supports only `XXX`. Ref https://neondb.slack.com/archives/C03H1K0PGKH/p1696506459720909?thread_ts=1696437812.365249&cid=C03H1K0PGKH ## Summary of changes - Deploy staging-preprod from main	2023-10-05 14:02:20 +00:00
John Spray	7cbb39063a	tests: stabilize + extend deletion queue recovery test (#5457 ) ## Problem This test was unstable when run in parallel with lots of others: if the pageserver stayed up long enough for some of the deletions to get validated, they won't be discarded on restart the way the test expects when keep_attachment=True. This was a test bug, not a pageserver bug. ## Summary of changes - Add failpoints to control plane api client - Use failpoint to pause validation in the test to cover the case where it had been flaky - Add a metric for the number of deleted keys validated - Add a permutation to the test to additionally exercise the case where we _do_ validate lists before restart: this is a coverage enhancement that seemed sensible when realizing that the test was relying on nothing being validated before restart. - the test will now always enter the restart with nothing or everything validated.	2023-10-05 11:22:05 +01:00