other things

Merge branch 'main' into ephemeralsad/graceful-draining
Better storcon API
2026-05-18 05:30:37 +00:00 · 2025-06-11 09:19:07 +04:00 · 2025-06-09 09:54:13 +04:00 · 2025-06-06 10:56:58 +04:00 · 2025-06-05 18:25:29 +04:00 · 2025-06-02 17:07:30 +04:00
62 changed files with 1217 additions and 1404 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -753,7 +753,6 @@ dependencies = [
 "axum",
 "axum-core",
 "bytes",
- "form_urlencoded",
 "futures-util",
 "headers",
 "http 1.1.0",
@@ -762,8 +761,6 @@ dependencies = [
 "mime",
 "pin-project-lite",
 "serde",
- "serde_html_form",
- "serde_path_to_error",
 "tower 0.5.2",
 "tower-layer",
 "tower-service",
@@ -6425,19 +6422,6 @@ dependencies = [
 "syn 2.0.100",
 ]

-[[package]]
-name = "serde_html_form"
-version = "0.2.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4"
-dependencies = [
- "form_urlencoded",
- "indexmap 2.9.0",
- "itoa",
- "ryu",
- "serde",
-]
-
 [[package]]
 name = "serde_json"
 version = "1.0.125"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -71,7 +71,7 @@ aws-credential-types = "1.2.0"
 aws-sigv4 = { version = "1.2", features = ["sign-http"] }
 aws-types = "1.3"
 axum = { version = "0.8.1", features = ["ws"] }
-axum-extra = { version = "0.10.0", features = ["typed-header", "query"] }
+axum-extra = { version = "0.10.0", features = ["typed-header"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.71"
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -785,7 +785,7 @@ impl ComputeNode {
        self.spawn_extension_stats_task();

        if pspec.spec.autoprewarm {
-            self.prewarm_lfc(None);
+            self.prewarm_lfc();
        }
        Ok(())
    }
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -25,16 +25,11 @@ struct EndpointStoragePair {
 }

 const KEY: &str = "lfc_state";
-impl EndpointStoragePair {
-    /// endpoint_id is set to None while prewarming from other endpoint, see replica promotion
-    /// If not None, takes precedence over pspec.spec.endpoint_id
-    fn from_spec_and_endpoint(
-        pspec: &crate::compute::ParsedSpec,
-        endpoint_id: Option<String>,
-    ) -> Result<Self> {
-        let endpoint_id = endpoint_id.as_ref().or(pspec.spec.endpoint_id.as_ref());
-        let Some(ref endpoint_id) = endpoint_id else {
-            bail!("pspec.endpoint_id missing, other endpoint_id not provided")
+impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair {
+    type Error = anyhow::Error;
+    fn try_from(pspec: &crate::compute::ParsedSpec) -> Result<Self, Self::Error> {
+        let Some(ref endpoint_id) = pspec.spec.endpoint_id else {
+            bail!("pspec.endpoint_id missing")
        };
        let Some(ref base_uri) = pspec.endpoint_storage_addr else {
            bail!("pspec.endpoint_storage_addr missing")
@@ -89,7 +84,7 @@ impl ComputeNode {
    }

    /// Returns false if there is a prewarm request ongoing, true otherwise
-    pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
+    pub fn prewarm_lfc(self: &Arc<Self>) -> bool {
        crate::metrics::LFC_PREWARM_REQUESTS.inc();
        {
            let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
@@ -102,7 +97,7 @@ impl ComputeNode {

        let cloned = self.clone();
        spawn(async move {
-            let Err(err) = cloned.prewarm_impl(from_endpoint).await else {
+            let Err(err) = cloned.prewarm_impl().await else {
                cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
                return;
            };
@@ -114,14 +109,13 @@ impl ComputeNode {
        true
    }

-    /// from_endpoint: None for endpoint managed by this compute_ctl
-    fn endpoint_storage_pair(&self, from_endpoint: Option<String>) -> Result<EndpointStoragePair> {
+    fn endpoint_storage_pair(&self) -> Result<EndpointStoragePair> {
        let state = self.state.lock().unwrap();
-        EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint)
+        state.pspec.as_ref().unwrap().try_into()
    }

-    async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<()> {
-        let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
+    async fn prewarm_impl(&self) -> Result<()> {
+        let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
        info!(%url, "requesting LFC state from endpoint storage");

        let request = Client::new().get(&url).bearer_auth(token);
@@ -179,7 +173,7 @@ impl ComputeNode {
    }

    async fn offload_lfc_impl(&self) -> Result<()> {
-        let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
+        let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?;
        info!(%url, "requesting LFC state from postgres");

        let mut compressed = Vec::new();
--- a/compute_tools/src/http/routes/lfc.rs
+++ b/compute_tools/src/http/routes/lfc.rs
@@ -2,7 +2,6 @@ use crate::compute_prewarm::LfcPrewarmStateWithProgress;
 use crate::http::JsonResponse;
 use axum::response::{IntoResponse, Response};
 use axum::{Json, http::StatusCode};
-use axum_extra::extract::OptionalQuery;
 use compute_api::responses::LfcOffloadState;
 type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;

@@ -17,16 +16,8 @@ pub(in crate::http) async fn offload_state(compute: Compute) -> Json<LfcOffloadS
    Json(compute.lfc_offload_state())
 }

-#[derive(serde::Deserialize)]
-pub struct PrewarmQuery {
-    pub from_endpoint: String,
-}
-
-pub(in crate::http) async fn prewarm(
-    compute: Compute,
-    OptionalQuery(query): OptionalQuery<PrewarmQuery>,
-) -> Response {
-    if compute.prewarm_lfc(query.map(|q| q.from_endpoint)) {
+pub(in crate::http) async fn prewarm(compute: Compute) -> Response {
+    if compute.prewarm_lfc() {
        StatusCode::ACCEPTED.into_response()
    } else {
        JsonResponse::error(
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -65,6 +65,10 @@ enum Command {
    NodeDelete {
        #[arg(long)]
        node_id: NodeId,
+
+        /// Force flag to delete the node without draining
+        #[arg(long)]
+        force: bool,
    },
    /// Delete a tombstone of node from the storage controller.
    NodeDeleteTombstone {
@@ -215,6 +219,8 @@ enum Command {
    StartDrain {
        #[arg(long)]
        node_id: NodeId,
+        #[arg(long)]
+        drain_all: Option<bool>,
    },
    /// Cancel draining the specified pageserver and wait for `timeout`
    /// for the operation to be canceled. May be retried.
@@ -903,7 +909,39 @@ async fn main() -> anyhow::Result<()> {
                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
                .await?;
        }
-        Command::NodeDelete { node_id } => {
+        Command::NodeDelete { node_id, force } => {
+            // If force is not set, we need to drain the node first
+            // This prevents the node from being deleted while there are still tenants on it
+            if !force {
+                match &storcon_client
+                    .dispatch::<(), NodeDescribeResponse>(
+                        Method::GET,
+                        format!("control/v1/node/{node_id}?drain_all=true"),
+                        None,
+                    )
+                    .await?
+                    .scheduling
+                {
+                    NodeSchedulingPolicy::Draining | NodeSchedulingPolicy::PauseForRestart => {
+                        println!("Node {} is already draining", node_id);
+                    }
+                    _ => {
+                        println!("Node {} is not draining, starting drain", node_id);
+                        storcon_client
+                            .dispatch::<(), ()>(
+                                Method::PUT,
+                                format!("control/v1/node/{node_id}/drain?graceful=true"),
+                                None,
+                            )
+                            .await?;
+                    }
+                }
+
+                // Wait for the node to be drained, printing the current state while waiting
+                watch_node_drain(&storcon_client, node_id).await?;
+            }
+
+            // Finally delete the node
            storcon_client
                .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
                .await?;
@@ -1151,13 +1189,14 @@ async fn main() -> anyhow::Result<()> {
                failure
            );
        }
-        Command::StartDrain { node_id } => {
+        Command::StartDrain { node_id, drain_all } => {
+            let path = if drain_all == Some(true) {
+                format!("control/v1/node/{node_id}/drain?drain_all=true")
+            } else {
+                format!("control/v1/node/{node_id}/drain")
+            };
            storcon_client
-                .dispatch::<(), ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
+                .dispatch::<(), ()>(Method::PUT, path, None)
                .await?;
            println!("Drain started for {node_id}");
        }
@@ -1350,3 +1389,46 @@ async fn watch_tenant_shard(
    }
    Ok(())
 }
+
+async fn watch_node_drain(storcon_client: &Client, node_id: NodeId) -> anyhow::Result<()> {
+    loop {
+        let node_desc = storcon_client
+            .dispatch::<(), NodeDescribeResponse>(
+                Method::GET,
+                format!("control/v1/node/{node_id}"),
+                None,
+            )
+            .await?;
+        let shards_count = storcon_client
+            .dispatch::<(), NodeShardResponse>(
+                Method::GET,
+                format!("control/v1/node/{node_id}/shards"),
+                None,
+            )
+            .await?
+            .shards
+            .len();
+
+        // Stop once the node is no longer Draining; any shards still on it are reported as an error
+        if node_desc.scheduling != NodeSchedulingPolicy::Draining {
+            if shards_count != 0 {
+                anyhow::bail!(
+                    "Node {} is not draining, but has {} shards",
+                    node_id,
+                    shards_count
+                );
+            }
+            break;
+        }
+
+        println!(
+            "Node {} is draining, {} shards remaining",
+            node_id, shards_count
+        );
+
+        tokio::time::sleep(WATCH_INTERVAL).await;
+    }
+
+    println!("Node {} is not draining", node_id);
+    Ok(())
+}
--- a/libs/proxy/tokio-postgres2/src/cancel_query.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs
@@ -1,12 +1,15 @@
+use std::io;
+
 use tokio::net::TcpStream;

 use crate::client::SocketConfig;
-use crate::config::Host;
+use crate::config::{Host, SslMode};
 use crate::tls::MakeTlsConnect;
-use crate::{Error, cancel_query_raw, connect_socket, connect_tls};
+use crate::{Error, cancel_query_raw, connect_socket};

 pub(crate) async fn cancel_query<T>(
-    config: SocketConfig,
+    config: Option<SocketConfig>,
+    ssl_mode: SslMode,
    tls: T,
    process_id: i32,
    secret_key: i32,
@@ -14,6 +17,16 @@ pub(crate) async fn cancel_query<T>(
 where
    T: MakeTlsConnect<TcpStream>,
 {
+    let config = match config {
+        Some(config) => config,
+        None => {
+            return Err(Error::connect(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "unknown host",
+            )));
+        }
+    };
+
    let hostname = match &config.host {
        Host::Tcp(host) => &**host,
    };
@@ -29,6 +42,5 @@ where
    )
    .await?;

-    let stream = connect_tls::connect_tls(socket, config.ssl_mode, tls).await?;
-    cancel_query_raw::cancel_query_raw(stream, process_id, secret_key).await
+    cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
 }
--- a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs
@@ -2,16 +2,23 @@ use bytes::BytesMut;
 use postgres_protocol2::message::frontend;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};

-use crate::Error;
+use crate::config::SslMode;
+use crate::tls::TlsConnect;
+use crate::{Error, connect_tls};

-pub async fn cancel_query_raw<S>(
-    mut stream: S,
+pub async fn cancel_query_raw<S, T>(
+    stream: S,
+    mode: SslMode,
+    tls: T,
    process_id: i32,
    secret_key: i32,
 ) -> Result<(), Error>
 where
    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsConnect<S>,
 {
+    let mut stream = connect_tls::connect_tls(stream, mode, tls).await?;
+
    let mut buf = BytesMut::new();
    frontend::cancel_request(process_id, secret_key, &mut buf);

--- a/libs/proxy/tokio-postgres2/src/cancel_token.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs
@@ -3,21 +3,16 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::TcpStream;

 use crate::client::SocketConfig;
-use crate::tls::MakeTlsConnect;
+use crate::config::SslMode;
+use crate::tls::{MakeTlsConnect, TlsConnect};
 use crate::{Error, cancel_query, cancel_query_raw};

 /// The capability to request cancellation of in-progress queries on a
 /// connection.
-#[derive(Clone)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct CancelToken {
-    pub socket_config: SocketConfig,
-    pub raw: RawCancelToken,
-}
-
-/// The capability to request cancellation of in-progress queries on a
-/// connection.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct RawCancelToken {
+    pub socket_config: Option<SocketConfig>,
+    pub ssl_mode: SslMode,
    pub process_id: i32,
    pub secret_key: i32,
 }
@@ -41,21 +36,28 @@ impl CancelToken {
    {
        cancel_query::cancel_query(
            self.socket_config.clone(),
+            self.ssl_mode,
            tls,
-            self.raw.process_id,
-            self.raw.secret_key,
+            self.process_id,
+            self.secret_key,
+        )
+        .await
+    }
+
+    /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new
+    /// connection itself.
+    pub async fn cancel_query_raw<S, T>(&self, stream: S, tls: T) -> Result<(), Error>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+        T: TlsConnect<S>,
+    {
+        cancel_query_raw::cancel_query_raw(
+            stream,
+            self.ssl_mode,
+            tls,
+            self.process_id,
+            self.secret_key,
        )
        .await
    }
 }
-
-impl RawCancelToken {
-    /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new
-    /// connection itself.
-    pub async fn cancel_query_raw<S>(&self, stream: S) -> Result<(), Error>
-    where
-        S: AsyncRead + AsyncWrite + Unpin,
-    {
-        cancel_query_raw::cancel_query_raw(stream, self.process_id, self.secret_key).await
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -12,7 +12,6 @@ use postgres_protocol2::message::frontend;
 use serde::{Deserialize, Serialize};
 use tokio::sync::mpsc;

-use crate::cancel_token::RawCancelToken;
 use crate::codec::{BackendMessages, FrontendMessage};
 use crate::config::{Host, SslMode};
 use crate::query::RowStream;
@@ -167,7 +166,6 @@ pub struct SocketConfig {
    pub host: Host,
    pub port: u16,
    pub connect_timeout: Option<Duration>,
-    pub ssl_mode: SslMode,
 }

 /// An asynchronous PostgreSQL client.
@@ -179,6 +177,7 @@ pub struct Client {
    cached_typeinfo: CachedTypeInfo,

    socket_config: SocketConfig,
+    ssl_mode: SslMode,
    process_id: i32,
    secret_key: i32,
 }
@@ -188,6 +187,7 @@ impl Client {
        sender: mpsc::UnboundedSender<FrontendMessage>,
        receiver: mpsc::Receiver<BackendMessages>,
        socket_config: SocketConfig,
+        ssl_mode: SslMode,
        process_id: i32,
        secret_key: i32,
    ) -> Client {
@@ -205,6 +205,7 @@ impl Client {
            cached_typeinfo: Default::default(),

            socket_config,
+            ssl_mode,
            process_id,
            secret_key,
        }
@@ -330,11 +331,10 @@ impl Client {
    /// connection associated with this client.
    pub fn cancel_token(&self) -> CancelToken {
        CancelToken {
-            socket_config: self.socket_config.clone(),
-            raw: RawCancelToken {
-                process_id: self.process_id,
-                secret_key: self.secret_key,
-            },
+            socket_config: Some(self.socket_config.clone()),
+            ssl_mode: self.ssl_mode,
+            process_id: self.process_id,
+            secret_key: self.secret_key,
        }
    }

--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -57,7 +57,6 @@ where
        host: host.clone(),
        port,
        connect_timeout: config.connect_timeout,
-        ssl_mode: config.ssl_mode,
    };

    let (client_tx, conn_rx) = mpsc::unbounded_channel();
@@ -66,6 +65,7 @@ where
        client_tx,
        client_rx,
        socket_config,
+        config.ssl_mode,
        process_id,
        secret_key,
    );
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -3,7 +3,7 @@

 use postgres_protocol2::message::backend::ReadyForQueryBody;

-pub use crate::cancel_token::{CancelToken, RawCancelToken};
+pub use crate::cancel_token::CancelToken;
 pub use crate::client::{Client, SocketConfig};
 pub use crate::config::Config;
 pub use crate::connect_raw::RawConnection;
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -10,7 +10,7 @@ use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use std::{env, io};

-use anyhow::{Context, Result, anyhow};
+use anyhow::{Context, Result};
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
 use azure_storage::StorageCredentials;
@@ -37,7 +37,6 @@ use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
    ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
-    Version, VersionKind,
 };

 pub struct AzureBlobStorage {
@@ -406,39 +405,6 @@ impl AzureBlobStorage {
    pub fn container_name(&self) -> &str {
        &self.container_name
    }
-
-    async fn list_versions_with_permit(
-        &self,
-        _permit: &tokio::sync::SemaphorePermit<'_>,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<crate::VersionListing, DownloadError> {
-        let customize_builder = |mut builder: ListBlobsBuilder| {
-            builder = builder.include_versions(true);
-            // We do not return this info back to `VersionListing` yet.
-            builder = builder.include_deleted(true);
-            builder
-        };
-        let kind = RequestKind::ListVersions;
-
-        let mut stream = std::pin::pin!(self.list_streaming_for_fn(
-            prefix,
-            mode,
-            max_keys,
-            cancel,
-            kind,
-            customize_builder
-        ));
-        let mut combined: crate::VersionListing =
-            stream.next().await.expect("At least one item required")?;
-        while let Some(list) = stream.next().await {
-            let list = list?;
-            combined.versions.extend(list.versions.into_iter());
-        }
-        Ok(combined)
-    }
 }

 trait ListingCollector {
@@ -522,10 +488,27 @@ impl RemoteStorage for AzureBlobStorage {
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> std::result::Result<crate::VersionListing, DownloadError> {
+        let customize_builder = |mut builder: ListBlobsBuilder| {
+            builder = builder.include_versions(true);
+            builder
+        };
        let kind = RequestKind::ListVersions;
-        let permit = self.permit(kind, cancel).await?;
-        self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
-            .await
+
+        let mut stream = std::pin::pin!(self.list_streaming_for_fn(
+            prefix,
+            mode,
+            max_keys,
+            cancel,
+            kind,
+            customize_builder
+        ));
+        let mut combined: crate::VersionListing =
+            stream.next().await.expect("At least one item required")?;
+        while let Some(list) = stream.next().await {
+            let list = list?;
+            combined.versions.extend(list.versions.into_iter());
+        }
+        Ok(combined)
    }

    async fn head_object(
@@ -820,158 +803,14 @@ impl RemoteStorage for AzureBlobStorage {

    async fn time_travel_recover(
        &self,
-        prefix: Option<&RemotePath>,
-        timestamp: SystemTime,
-        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError> {
-        let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected "
-            .to_string()
-            + "for some specific files. If a file gets deleted but then overwritten and we want to recover "
-            + "to the time during the file was not present, this functionality will recover the file. Only "
-            + "use the functionality for services that can tolerate this. For example, recovering a state of the "
-            + "pageserver tenants.";
-        tracing::error!("{}", msg);
-
-        let kind = RequestKind::TimeTravel;
-        let permit = self.permit(kind, cancel).await?;
-
-        let mode = ListingMode::NoDelimiter;
-        let version_listing = self
-            .list_versions_with_permit(&permit, prefix, mode, None, cancel)
-            .await
-            .map_err(|err| match err {
-                DownloadError::Other(e) => TimeTravelError::Other(e),
-                DownloadError::Cancelled => TimeTravelError::Cancelled,
-                other => TimeTravelError::Other(other.into()),
-            })?;
-        let versions_and_deletes = version_listing.versions;
-
-        tracing::info!(
-            "Built list for time travel with {} versions and deletions",
-            versions_and_deletes.len()
-        );
-
-        // Work on the list of references instead of the objects directly,
-        // otherwise we get lifetime errors in the sort_by_key call below.
-        let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
-
-        versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
-
-        let mut vds_for_key = HashMap::<_, Vec<_>>::new();
-
-        for vd in &versions_and_deletes {
-            let Version { key, .. } = &vd;
-            let version_id = vd.version_id().map(|v| v.0.as_str());
-            if version_id == Some("null") {
-                return Err(TimeTravelError::Other(anyhow!(
-                    "Received ListVersions response for key={key} with version_id='null', \
-                        indicating either disabled versioning, or legacy objects with null version id values"
-                )));
-            }
-            tracing::trace!("Parsing version key={key} kind={:?}", vd.kind);
-
-            vds_for_key.entry(key).or_default().push(vd);
-        }
-
-        let warn_threshold = 3;
-        let max_retries = 10;
-        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
-
-        for (key, versions) in vds_for_key {
-            let last_vd = versions.last().unwrap();
-            let key = self.relative_path_to_name(key);
-            if last_vd.last_modified > done_if_after {
-                tracing::debug!("Key {key} has version later than done_if_after, skipping");
-                continue;
-            }
-            // the version we want to restore to.
-            let version_to_restore_to =
-                match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
-                    Ok(v) => v,
-                    Err(e) => e,
-                };
-            if version_to_restore_to == versions.len() {
-                tracing::debug!("Key {key} has no changes since timestamp, skipping");
-                continue;
-            }
-            let mut do_delete = false;
-            if version_to_restore_to == 0 {
-                // All versions more recent, so the key didn't exist at the specified time point.
-                tracing::debug!(
-                    "All {} versions more recent for {key}, deleting",
-                    versions.len()
-                );
-                do_delete = true;
-            } else {
-                match &versions[version_to_restore_to - 1] {
-                    Version {
-                        kind: VersionKind::Version(version_id),
-                        ..
-                    } => {
-                        let source_url = format!(
-                            "{}/{}?versionid={}",
-                            self.client
-                                .url()
-                                .map_err(|e| TimeTravelError::Other(anyhow!("{e}")))?,
-                            key,
-                            version_id.0
-                        );
-                        tracing::debug!(
-                            "Promoting old version {} for {key} at {}...",
-                            version_id.0,
-                            source_url
-                        );
-                        backoff::retry(
-                            || async {
-                                let blob_client = self.client.blob_client(key.clone());
-                                let op = blob_client.copy(Url::from_str(&source_url).unwrap());
-                                tokio::select! {
-                                    res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
-                                    _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
-                                }
-                            },
-                            is_permanent,
-                            warn_threshold,
-                            max_retries,
-                            "copying object version for time_travel_recover",
-                            cancel,
-                        )
-                        .await
-                        .ok_or_else(|| TimeTravelError::Cancelled)
-                        .and_then(|x| x)?;
-                        tracing::info!(?version_id, %key, "Copied old version in Azure blob storage");
-                    }
-                    Version {
-                        kind: VersionKind::DeletionMarker,
-                        ..
-                    } => {
-                        do_delete = true;
-                    }
-                }
-            };
-            if do_delete {
-                if matches!(last_vd.kind, VersionKind::DeletionMarker) {
-                    // Key has since been deleted (but there was some history), no need to do anything
-                    tracing::debug!("Key {key} already deleted, skipping.");
-                } else {
-                    tracing::debug!("Deleting {key}...");
-
-                    self.delete(&RemotePath::from_string(&key).unwrap(), cancel)
-                        .await
-                        .map_err(|e| {
-                            // delete_oid0 will use TimeoutOrCancel
-                            if TimeoutOrCancel::caused_by_cancel(&e) {
-                                TimeTravelError::Cancelled
-                            } else {
-                                TimeTravelError::Other(e)
-                            }
-                        })?;
-                }
-            }
-        }
-
-        Ok(())
+        // TODO: use Azure's point-in-time restore feature for this
+        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
+        Err(TimeTravelError::Unimplemented)
    }
 }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -1022,7 +1022,6 @@ impl RemoteStorage for S3Bucket {
            let Version { key, .. } = &vd;
            let version_id = vd.version_id().map(|v| v.0.as_str());
            if version_id == Some("null") {
-                // TODO: check the behavior of using the SDK on a non-versioned container
                return Err(TimeTravelError::Other(anyhow!(
                    "Received ListVersions response for key={key} with version_id='null', \
                    indicating either disabled versioning, or legacy objects with null version id values"
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -573,8 +573,7 @@ fn start_pageserver(
        tokio::sync::mpsc::unbounded_channel();
    let deletion_queue_client = deletion_queue.new_client();
    let background_purges = mgr::BackgroundPurges::default();
-
-    let tenant_manager = mgr::init(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        background_purges.clone(),
        TenantSharedResources {
@@ -585,10 +584,10 @@ fn start_pageserver(
            basebackup_prepare_sender,
            feature_resolver,
        },
+        order,
        shutdown_pageserver.clone(),
-    );
+    ))?;
    let tenant_manager = Arc::new(tenant_manager);
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;

    let basebackup_cache = BasebackupCache::spawn(
        BACKGROUND_RUNTIME.handle(),
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -1,6 +1,5 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};

-use pageserver_api::config::NodeMetadata;
 use posthog_client_lite::{
    CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
    PostHogFlagFilterPropertyValue,
@@ -87,35 +86,7 @@ impl FeatureResolver {
                        }
                    }
                }
-                // TODO: move this to a background task so that we don't block startup in case of slow disk
-                let metadata_path = conf.metadata_path();
-                match std::fs::read_to_string(&metadata_path) {
-                    Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
-                        Ok(metadata) => {
-                            properties.insert(
-                                "hostname".to_string(),
-                                PostHogFlagFilterPropertyValue::String(metadata.http_host),
-                            );
-                            if let Some(cplane_region) = metadata.other.get("region_id") {
-                                if let Some(cplane_region) = cplane_region.as_str() {
-                                    // This region contains the cell number
-                                    properties.insert(
-                                        "neon_region".to_string(),
-                                        PostHogFlagFilterPropertyValue::String(
-                                            cplane_region.to_string(),
-                                        ),
-                                    );
-                                }
-                            }
-                        }
-                        Err(e) => {
-                            tracing::warn!("Failed to parse metadata.json: {}", e);
-                        }
-                    },
-                    Err(e) => {
-                        tracing::warn!("Failed to read metadata.json: {}", e);
-                    }
-                }
+                // TODO: add pageserver URL.
                Arc::new(properties)
            };
            let fake_tenants = {
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1053,15 +1053,6 @@ pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-pub(crate) static TIMELINE_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_timeline_states_count",
-        "Count of timelines per state",
-        &["state"]
-    )
-    .expect("Failed to register pageserver_timeline_states_count metric")
-});
-
 /// A set of broken tenants.
 ///
 /// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
@@ -3334,8 +3325,6 @@ impl TimelineMetrics {
                &timeline_id,
            );

-        TIMELINE_STATE_METRIC.with_label_values(&["active"]).inc();
-
        TimelineMetrics {
            tenant_id,
            shard_id,
@@ -3490,8 +3479,6 @@ impl TimelineMetrics {
            return;
        }

-        TIMELINE_STATE_METRIC.with_label_values(&["active"]).dec();
-
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -89,8 +89,7 @@ use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
-    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, TIMELINE_STATE_METRIC,
-    remove_tenant_metrics,
+    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -545,28 +544,6 @@ pub struct OffloadedTimeline {

    /// Part of the `OffloadedTimeline` object's lifecycle: this needs to be set before we drop it
    pub deleted_from_ancestor: AtomicBool,
-
-    _metrics_guard: OffloadedTimelineMetricsGuard,
-}
-
-/// Increases the offloaded timeline count metric when created, and decreases when dropped.
-struct OffloadedTimelineMetricsGuard;
-
-impl OffloadedTimelineMetricsGuard {
-    fn new() -> Self {
-        TIMELINE_STATE_METRIC
-            .with_label_values(&["offloaded"])
-            .inc();
-        Self
-    }
-}
-
-impl Drop for OffloadedTimelineMetricsGuard {
-    fn drop(&mut self) {
-        TIMELINE_STATE_METRIC
-            .with_label_values(&["offloaded"])
-            .dec();
-    }
 }

 impl OffloadedTimeline {
@@ -599,8 +576,6 @@ impl OffloadedTimeline {

            delete_progress: timeline.delete_progress.clone(),
            deleted_from_ancestor: AtomicBool::new(false),
-
-            _metrics_guard: OffloadedTimelineMetricsGuard::new(),
        })
    }
    fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
@@ -620,7 +595,6 @@ impl OffloadedTimeline {
            archived_at,
            delete_progress: TimelineDeleteProgress::default(),
            deleted_from_ancestor: AtomicBool::new(false),
-            _metrics_guard: OffloadedTimelineMetricsGuard::new(),
        }
    }
    fn manifest(&self) -> OffloadedTimelineManifest {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -12,6 +12,7 @@ use anyhow::Context;
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
+use once_cell::sync::Lazy;
 use pageserver_api::key::Key;
 use pageserver_api::models::{DetachBehavior, LocationConfigMode};
 use pageserver_api::shard::{
@@ -102,7 +103,7 @@ pub(crate) enum TenantsMap {
    /// [`init_tenant_mgr`] is not done yet.
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
-    /// New tenants can be added using [`TenantManager::tenant_map_acquire_slot`].
+    /// New tenants can be added using [`tenant_map_acquire_slot`].
    Open(BTreeMap<TenantShardId, TenantSlot>),
    /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
    /// Existing tenants are still accessible, but no new tenants can be created.
@@ -283,6 +284,9 @@ impl BackgroundPurges {
    }
 }

+static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
+    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
+
 /// Responsible for storing and mutating the collection of all tenants
 /// that this pageserver has state for.
 ///
@@ -293,7 +297,10 @@ impl BackgroundPurges {
 /// and attached modes concurrently.
 pub struct TenantManager {
    conf: &'static PageServerConf,
-    tenants: std::sync::RwLock<TenantsMap>,
+    // TODO: currently this is a &'static pointing to TENANTS.  When we finish refactoring
+    // out of that static variable, the TenantManager can own this.
+    // See https://github.com/neondatabase/neon/issues/5796
+    tenants: &'static std::sync::RwLock<TenantsMap>,
    resources: TenantSharedResources,

    // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
@@ -472,43 +479,21 @@ pub(crate) enum DeleteTenantError {
    Other(#[from] anyhow::Error),
 }

-/// Initialize repositories at `Initializing` state.
-pub fn init(
-    conf: &'static PageServerConf,
-    background_purges: BackgroundPurges,
-    resources: TenantSharedResources,
-    cancel: CancellationToken,
-) -> TenantManager {
-    TenantManager {
-        conf,
-        tenants: std::sync::RwLock::new(TenantsMap::Initializing),
-        resources,
-        cancel,
-        background_purges,
-    }
-}
-
-/// Transition repositories from `Initializing` state to `Open` state with locally available timelines.
+/// Initialize repositories with locally available timelines.
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the tenant once download is completed.
 #[instrument(skip_all)]
 pub async fn init_tenant_mgr(
-    tenant_manager: Arc<TenantManager>,
+    conf: &'static PageServerConf,
+    background_purges: BackgroundPurges,
+    resources: TenantSharedResources,
    init_order: InitializationOrder,
-) -> anyhow::Result<()> {
-    debug_assert!(matches!(
-        *tenant_manager.tenants.read().unwrap(),
-        TenantsMap::Initializing
-    ));
+    cancel: CancellationToken,
+) -> anyhow::Result<TenantManager> {
    let mut tenants = BTreeMap::new();

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);

-    let conf = tenant_manager.conf;
-    let resources = &tenant_manager.resources;
-    let cancel = &tenant_manager.cancel;
-    let background_purges = &tenant_manager.background_purges;
-
    // Initialize dynamic limits that depend on system resources
    let system_memory =
        sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
@@ -527,7 +512,7 @@ pub async fn init_tenant_mgr(
    let tenant_configs = init_load_tenant_configs(conf).await;

    // Determine which tenants are to be secondary or attached, and in which generation
-    let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?;
+    let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

    tracing::info!(
        "Attaching {} tenants at startup, warming up {} at a time",
@@ -684,10 +669,18 @@ pub async fn init_tenant_mgr(

    info!("Processed {} local tenants at startup", tenants.len());

-    let mut tenant_map = tenant_manager.tenants.write().unwrap();
-    *tenant_map = TenantsMap::Open(tenants);
+    let mut tenants_map = TENANTS.write().unwrap();
+    assert!(matches!(&*tenants_map, &TenantsMap::Initializing));

-    Ok(())
+    *tenants_map = TenantsMap::Open(tenants);
+
+    Ok(TenantManager {
+        conf,
+        tenants: &TENANTS,
+        resources,
+        cancel: CancellationToken::new(),
+        background_purges,
+    })
 }

 /// Wrapper for Tenant::spawn that checks invariants before running
@@ -726,6 +719,142 @@ fn tenant_spawn(
    )
 }

+async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
+    let mut join_set = JoinSet::new();
+
+    #[cfg(all(debug_assertions, not(test)))]
+    {
+        // Check that our metrics properly tracked the size of the tenants map.  This is a convenient location to check,
+        // as it happens implicitly at the end of tests etc.
+        let m = tenants.read().unwrap();
+        debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
+    }
+
+    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
+    let (total_in_progress, total_attached) = {
+        let mut m = tenants.write().unwrap();
+        match &mut *m {
+            TenantsMap::Initializing => {
+                *m = TenantsMap::ShuttingDown(BTreeMap::default());
+                info!("tenants map is empty");
+                return;
+            }
+            TenantsMap::Open(tenants) => {
+                let mut shutdown_state = BTreeMap::new();
+                let mut total_in_progress = 0;
+                let mut total_attached = 0;
+
+                for (tenant_shard_id, v) in std::mem::take(tenants).into_iter() {
+                    match v {
+                        TenantSlot::Attached(t) => {
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
+                            join_set.spawn(
+                                async move {
+                                    let res = {
+                                        let (_guard, shutdown_progress) = completion::channel();
+                                        t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
+                                    };
+
+                                    if let Err(other_progress) = res {
+                                        // join the other shutdown already in progress
+                                        other_progress.wait().await;
+                                    }
+
+                                    // we cannot afford per tenant logging here, because if s3 is degraded, we are
+                                    // going to log too many lines
+                                    debug!("tenant successfully stopped");
+                                }
+                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
+                            );
+
+                            total_attached += 1;
+                        }
+                        TenantSlot::Secondary(state) => {
+                            // We don't need to wait for this individually per-tenant: the
+                            // downloader task will be waited on eventually, this cancel
+                            // is just to encourage it to drop out if it is doing work
+                            // for this tenant right now.
+                            state.cancel.cancel();
+
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
+                        }
+                        TenantSlot::InProgress(notify) => {
+                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
+                            // wait for their notifications to fire in this function.
+                            join_set.spawn(async move {
+                                notify.wait().await;
+                            });
+
+                            total_in_progress += 1;
+                        }
+                    }
+                }
+                *m = TenantsMap::ShuttingDown(shutdown_state);
+                (total_in_progress, total_attached)
+            }
+            TenantsMap::ShuttingDown(_) => {
+                error!(
+                    "already shutting down, this function isn't supposed to be called more than once"
+                );
+                return;
+            }
+        }
+    };
+
+    let started_at = std::time::Instant::now();
+
+    info!(
+        "Waiting for {} InProgress tenants and {} Attached tenants to shut down",
+        total_in_progress, total_attached
+    );
+
+    let total = join_set.len();
+    let mut panicked = 0;
+    let mut buffering = true;
+    const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
+    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));
+
+    while !join_set.is_empty() {
+        tokio::select! {
+            Some(joined) = join_set.join_next() => {
+                match joined {
+                    Ok(()) => {},
+                    Err(join_error) if join_error.is_cancelled() => {
+                        unreachable!("we are not cancelling any of the tasks");
+                    }
+                    Err(join_error) if join_error.is_panic() => {
+                        // cannot really do anything, as this panic is likely a bug
+                        panicked += 1;
+                    }
+                    Err(join_error) => {
+                        warn!("unknown kind of JoinError: {join_error}");
+                    }
+                }
+                if !buffering {
+                    // buffer so that every 500ms since the first update (or starting) we'll log
+                    // how far away we are; this is because we will get SIGKILL'd at 10s, and we
+                    // are not able to log *then*.
+                    buffering = true;
+                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
+                }
+            },
+            _ = &mut buffered, if buffering => {
+                buffering = false;
+                info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
+            }
+        }
+    }
+
+    if panicked > 0 {
+        warn!(
+            panicked,
+            total, "observed panicks while shutting down tenants"
+        );
+    }
+
+    // caller will log how long we took
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum UpsertLocationError {
    #[error("Bad config request: {0}")]
@@ -927,8 +1056,7 @@ impl TenantManager {
        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-        let mut slot_guard = self
-            .tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
            .map_err(|e| match e {
                TenantSlotError::NotFound(_) => {
                    unreachable!("Called with mode Any")
@@ -1095,75 +1223,6 @@ impl TenantManager {
        }
    }

-    fn tenant_map_acquire_slot(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        mode: TenantSlotAcquireMode,
-    ) -> Result<SlotGuard, TenantSlotError> {
-        use TenantSlotAcquireMode::*;
-        METRICS.tenant_slot_writes.inc();
-
-        let mut locked = self.tenants.write().unwrap();
-        let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
-        let _guard = span.enter();
-
-        let m = match &mut *locked {
-            TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
-            TenantsMap::ShuttingDown(_) => return Err(TenantMapError::ShuttingDown.into()),
-            TenantsMap::Open(m) => m,
-        };
-
-        use std::collections::btree_map::Entry;
-
-        let entry = m.entry(*tenant_shard_id);
-
-        match entry {
-            Entry::Vacant(v) => match mode {
-                MustExist => {
-                    tracing::debug!("Vacant && MustExist: return NotFound");
-                    Err(TenantSlotError::NotFound(*tenant_shard_id))
-                }
-                _ => {
-                    let (completion, barrier) = utils::completion::channel();
-                    let inserting = TenantSlot::InProgress(barrier);
-                    METRICS.slot_inserted(&inserting);
-                    v.insert(inserting);
-                    tracing::debug!("Vacant, inserted InProgress");
-                    Ok(SlotGuard::new(
-                        *tenant_shard_id,
-                        None,
-                        completion,
-                        &self.tenants,
-                    ))
-                }
-            },
-            Entry::Occupied(mut o) => {
-                // Apply mode-driven checks
-                match (o.get(), mode) {
-                    (TenantSlot::InProgress(_), _) => {
-                        tracing::debug!("Occupied, failing for InProgress");
-                        Err(TenantSlotError::InProgress)
-                    }
-                    _ => {
-                        // Happy case: the slot was not in any state that violated our mode
-                        let (completion, barrier) = utils::completion::channel();
-                        let in_progress = TenantSlot::InProgress(barrier);
-                        METRICS.slot_inserted(&in_progress);
-                        let old_value = o.insert(in_progress);
-                        METRICS.slot_removed(&old_value);
-                        tracing::debug!("Occupied, replaced with InProgress");
-                        Ok(SlotGuard::new(
-                            *tenant_shard_id,
-                            Some(old_value),
-                            completion,
-                            &self.tenants,
-                        ))
-                    }
-                }
-            }
-        }
-    }
-
    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
    /// dropped before re-attaching.
@@ -1180,8 +1239,7 @@ impl TenantManager {
        drop_cache: bool,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let mut slot_guard =
-            self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        let Some(old_slot) = slot_guard.get_old_value() else {
            anyhow::bail!("Tenant not found when trying to reset");
        };
@@ -1330,8 +1388,7 @@ impl TenantManager {
            Ok(())
        }

-        let slot_guard =
-            self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        match &slot_guard.old_value {
            Some(TenantSlot::Attached(tenant)) => {
                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
@@ -1482,7 +1539,7 @@ impl TenantManager {
        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
        drop(tenant);
        let mut parent_slot_guard =
-            self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        let parent = match parent_slot_guard.get_old_value() {
            Some(TenantSlot::Attached(t)) => t,
            Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
@@ -1786,145 +1843,7 @@ impl TenantManager {
    pub(crate) async fn shutdown(&self) {
        self.cancel.cancel();

-        self.shutdown_all_tenants0().await
-    }
-
-    async fn shutdown_all_tenants0(&self) {
-        let mut join_set = JoinSet::new();
-
-        #[cfg(all(debug_assertions, not(test)))]
-        {
-            // Check that our metrics properly tracked the size of the tenants map.  This is a convenient location to check,
-            // as it happens implicitly at the end of tests etc.
-            let m = self.tenants.read().unwrap();
-            debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
-        }
-
-        // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
-        let (total_in_progress, total_attached) = {
-            let mut m = self.tenants.write().unwrap();
-            match &mut *m {
-                TenantsMap::Initializing => {
-                    *m = TenantsMap::ShuttingDown(BTreeMap::default());
-                    info!("tenants map is empty");
-                    return;
-                }
-                TenantsMap::Open(tenants) => {
-                    let mut shutdown_state = BTreeMap::new();
-                    let mut total_in_progress = 0;
-                    let mut total_attached = 0;
-
-                    for (tenant_shard_id, v) in std::mem::take(tenants).into_iter() {
-                        match v {
-                            TenantSlot::Attached(t) => {
-                                shutdown_state
-                                    .insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
-                                join_set.spawn(
-                                    async move {
-                                        let res = {
-                                            let (_guard, shutdown_progress) = completion::channel();
-                                            t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
-                                        };
-
-                                        if let Err(other_progress) = res {
-                                            // join the another shutdown in progress
-                                            other_progress.wait().await;
-                                        }
-
-                                        // we cannot afford per tenant logging here, because if s3 is degraded, we are
-                                        // going to log too many lines
-                                        debug!("tenant successfully stopped");
-                                    }
-                                    .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
-                                );
-
-                                total_attached += 1;
-                            }
-                            TenantSlot::Secondary(state) => {
-                                // We don't need to wait for this individually per-tenant: the
-                                // downloader task will be waited on eventually, this cancel
-                                // is just to encourage it to drop out if it is doing work
-                                // for this tenant right now.
-                                state.cancel.cancel();
-
-                                shutdown_state
-                                    .insert(tenant_shard_id, TenantSlot::Secondary(state));
-                            }
-                            TenantSlot::InProgress(notify) => {
-                                // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
-                                // wait for their notifications to fire in this function.
-                                join_set.spawn(async move {
-                                    notify.wait().await;
-                                });
-
-                                total_in_progress += 1;
-                            }
-                        }
-                    }
-                    *m = TenantsMap::ShuttingDown(shutdown_state);
-                    (total_in_progress, total_attached)
-                }
-                TenantsMap::ShuttingDown(_) => {
-                    error!(
-                        "already shutting down, this function isn't supposed to be called more than once"
-                    );
-                    return;
-                }
-            }
-        };
-
-        let started_at = std::time::Instant::now();
-
-        info!(
-            "Waiting for {} InProgress tenants and {} Attached tenants to shut down",
-            total_in_progress, total_attached
-        );
-
-        let total = join_set.len();
-        let mut panicked = 0;
-        let mut buffering = true;
-        const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
-        let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));
-
-        while !join_set.is_empty() {
-            tokio::select! {
-                Some(joined) = join_set.join_next() => {
-                    match joined {
-                        Ok(()) => {},
-                        Err(join_error) if join_error.is_cancelled() => {
-                            unreachable!("we are not cancelling any of the tasks");
-                        }
-                        Err(join_error) if join_error.is_panic() => {
-                            // cannot really do anything, as this panic is likely a bug
-                            panicked += 1;
-                        }
-                        Err(join_error) => {
-                            warn!("unknown kind of JoinError: {join_error}");
-                        }
-                    }
-                    if !buffering {
-                        // buffer so that every 500ms since the first update (or starting) we'll log
-                        // how far away we are; this is because we will get SIGKILL'd at 10s, and we
-                        // are not able to log *then*.
-                        buffering = true;
-                        buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
-                    }
-                },
-                _ = &mut buffered, if buffering => {
-                    buffering = false;
-                    info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
-                }
-            }
-        }
-
-        if panicked > 0 {
-            warn!(
-                panicked,
-                total, "observed panicks while shutting down tenants"
-            );
-        }
-
-        // caller will log how long we took
+        shutdown_all_tenants0(self.tenants).await
    }

    /// Detaches a tenant, and removes its local files asynchronously.
@@ -1970,12 +1889,12 @@ impl TenantManager {
                .map(Some)
        };

-        let mut removal_result = self
-            .remove_tenant_from_memory(
-                tenant_shard_id,
-                tenant_dir_rename_operation(tenant_shard_id),
-            )
-            .await;
+        let mut removal_result = remove_tenant_from_memory(
+            self.tenants,
+            tenant_shard_id,
+            tenant_dir_rename_operation(tenant_shard_id),
+        )
+        .await;

        // If the tenant was not found, it was likely already removed. Attempt to remove the tenant
        // directory on disk anyway. For example, during shard splits, we shut down and remove the
@@ -2029,16 +1948,17 @@ impl TenantManager {
    ) -> Result<HashSet<TimelineId>, detach_ancestor::Error> {
        use detach_ancestor::Error;

-        let slot_guard = self
-            .tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)
-            .map_err(|e| {
-                use TenantSlotError::*;
+        let slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err(
+                |e| {
+                    use TenantSlotError::*;

-                match e {
-                    MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown,
-                    NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()),
-                }
-            })?;
+                    match e {
+                        MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown,
+                        NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()),
+                    }
+                },
+            )?;

        let tenant = {
            let old_slot = slot_guard
@@ -2371,80 +2291,6 @@ impl TenantManager {
            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
        })
    }
-
-    /// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
-    /// Allows to remove other tenant resources manually, via `tenant_cleanup`.
-    /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
-    async fn remove_tenant_from_memory<V, F>(
-        &self,
-        tenant_shard_id: TenantShardId,
-        tenant_cleanup: F,
-    ) -> Result<V, TenantStateError>
-    where
-        F: std::future::Future<Output = anyhow::Result<V>>,
-    {
-        let mut slot_guard =
-            self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
-
-        // allow pageserver shutdown to await for our completion
-        let (_guard, progress) = completion::channel();
-
-        // The SlotGuard allows us to manipulate the Tenant object without fear of some
-        // concurrent API request doing something else for the same tenant ID.
-        let attached_tenant = match slot_guard.get_old_value() {
-            Some(TenantSlot::Attached(tenant)) => {
-                // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-                let shutdown_mode = ShutdownMode::Hard;
-
-                // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
-                // that we can continue safely to cleanup.
-                match tenant.shutdown(progress, shutdown_mode).await {
-                    Ok(()) => {}
-                    Err(_other) => {
-                        // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
-                        // wait for it but return an error right away because these are distinct requests.
-                        slot_guard.revert();
-                        return Err(TenantStateError::IsStopping(tenant_shard_id));
-                    }
-                }
-                Some(tenant)
-            }
-            Some(TenantSlot::Secondary(secondary_state)) => {
-                tracing::info!("Shutting down in secondary mode");
-                secondary_state.shutdown().await;
-                None
-            }
-            Some(TenantSlot::InProgress(_)) => {
-                // Acquiring a slot guarantees its old value was not InProgress
-                unreachable!();
-            }
-            None => None,
-        };
-
-        match tenant_cleanup
-            .await
-            .with_context(|| format!("Failed to run cleanup for tenant {tenant_shard_id}"))
-        {
-            Ok(hook_value) => {
-                // Success: drop the old TenantSlot::Attached.
-                slot_guard
-                    .drop_old_value()
-                    .expect("We just called shutdown");
-
-                Ok(hook_value)
-            }
-            Err(e) => {
-                // If we had a Tenant, set it to Broken and put it back in the TenantsMap
-                if let Some(attached_tenant) = attached_tenant {
-                    attached_tenant.set_broken(e.to_string()).await;
-                }
-                // Leave the broken tenant in the map
-                slot_guard.revert();
-
-                Err(TenantStateError::Other(e))
-            }
-        }
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2609,7 +2455,7 @@ pub(crate) enum TenantMapError {
 /// this tenant to retry later, or wait for the InProgress state to end.
 ///
 /// This structure enforces the important invariant that we do not have overlapping
-/// tasks that will try to use local storage for a the same tenant ID: we enforce that
+/// tasks that will try to use local storage for the same tenant ID: we enforce that
 /// the previous contents of a slot have been shut down before the slot can be
 /// left empty or used for something else
 ///
@@ -2622,7 +2468,7 @@ pub(crate) enum TenantMapError {
 /// The `old_value` may be dropped before the SlotGuard is dropped, by calling
 /// `drop_old_value`.  It is an error to call this without shutting down
 /// the conents of `old_value`.
-pub(crate) struct SlotGuard<'a> {
+pub(crate) struct SlotGuard {
    tenant_shard_id: TenantShardId,
    old_value: Option<TenantSlot>,
    upserted: bool,
@@ -2630,23 +2476,19 @@ pub(crate) struct SlotGuard<'a> {
    /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
    /// release any waiters as soon as this SlotGuard is dropped.
    completion: utils::completion::Completion,
-
-    tenants: &'a std::sync::RwLock<TenantsMap>,
 }

-impl<'a> SlotGuard<'a> {
+impl SlotGuard {
    fn new(
        tenant_shard_id: TenantShardId,
        old_value: Option<TenantSlot>,
        completion: utils::completion::Completion,
-        tenants: &'a std::sync::RwLock<TenantsMap>,
    ) -> Self {
        Self {
            tenant_shard_id,
            old_value,
            upserted: false,
            completion,
-            tenants,
        }
    }

@@ -2670,8 +2512,8 @@ impl<'a> SlotGuard<'a> {
            ));
        }

-        let replaced: Option<TenantSlot> = {
-            let mut locked = self.tenants.write().unwrap();
+        let replaced = {
+            let mut locked = TENANTS.write().unwrap();

            if let TenantSlot::InProgress(_) = new_value {
                // It is never expected to try and upsert InProgress via this path: it should
@@ -2779,7 +2621,7 @@ impl<'a> SlotGuard<'a> {
    }
 }

-impl<'a> Drop for SlotGuard<'a> {
+impl Drop for SlotGuard {
    fn drop(&mut self) {
        if self.upserted {
            return;
@@ -2787,7 +2629,7 @@ impl<'a> Drop for SlotGuard<'a> {
        // Our old value is already shutdown, or it never existed: it is safe
        // for us to fully release the TenantSlot back into an empty state

-        let mut locked = self.tenants.write().unwrap();
+        let mut locked = TENANTS.write().unwrap();

        let m = match &mut *locked {
            TenantsMap::Initializing => {
@@ -2869,6 +2711,151 @@ enum TenantSlotAcquireMode {
    MustExist,
 }

+/// Acquires the slot for `tenant_shard_id` in the `TENANTS` map; see the `_impl` variant.
+fn tenant_map_acquire_slot(
+    tenant_shard_id: &TenantShardId,
+    mode: TenantSlotAcquireMode,
+) -> Result<SlotGuard, TenantSlotError> {
+    tenant_map_acquire_slot_impl(tenant_shard_id, &TENANTS, mode)
+}
+/// Marks the slot for `tenant_shard_id` in `tenants` as [`TenantSlot::InProgress`] and returns
+/// a [`SlotGuard`] holding whatever occupied the slot before (if anything).
+///
+/// The `InProgress` marker carries the barrier half of a fresh completion channel; the
+/// returned guard carries the completion half.
+///
+/// # Errors
+///
+/// - [`TenantMapError::StillInitializing`] or [`TenantMapError::ShuttingDown`] (converted
+///   into [`TenantSlotError`]) when the map is not open.
+/// - [`TenantSlotError::NotFound`] when the slot is vacant and `mode` is `MustExist`.
+/// - [`TenantSlotError::InProgress`] when the slot is already `InProgress`.
+fn tenant_map_acquire_slot_impl(
+    tenant_shard_id: &TenantShardId,
+    tenants: &std::sync::RwLock<TenantsMap>,
+    mode: TenantSlotAcquireMode,
+) -> Result<SlotGuard, TenantSlotError> {
+    use std::collections::btree_map::Entry;
+
+    METRICS.tenant_slot_writes.inc();
+
+    let mut map_guard = tenants.write().unwrap();
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
+    let _entered = span.enter();
+
+    let slots = match &mut *map_guard {
+        TenantsMap::Open(slots) => slots,
+        TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
+        TenantsMap::ShuttingDown(_) => return Err(TenantMapError::ShuttingDown.into()),
+    };
+
+    match slots.entry(*tenant_shard_id) {
+        Entry::Vacant(_) if matches!(mode, TenantSlotAcquireMode::MustExist) => {
+            tracing::debug!("Vacant && MustExist: return NotFound");
+            Err(TenantSlotError::NotFound(*tenant_shard_id))
+        }
+        Entry::Vacant(vacant) => {
+            // Nothing lived here before, so the guard has no old value to look after.
+            let (completion, barrier) = utils::completion::channel();
+            let placeholder = TenantSlot::InProgress(barrier);
+            METRICS.slot_inserted(&placeholder);
+            vacant.insert(placeholder);
+            tracing::debug!("Vacant, inserted InProgress");
+            Ok(SlotGuard::new(*tenant_shard_id, None, completion))
+        }
+        Entry::Occupied(occupied) if matches!(occupied.get(), TenantSlot::InProgress(_)) => {
+            tracing::debug!("Occupied, failing for InProgress");
+            Err(TenantSlotError::InProgress)
+        }
+        Entry::Occupied(mut occupied) => {
+            // Happy case: swap the current contents for an InProgress marker and hand the
+            // previous contents over to the guard.
+            let (completion, barrier) = utils::completion::channel();
+            let placeholder = TenantSlot::InProgress(barrier);
+            METRICS.slot_inserted(&placeholder);
+            let previous = occupied.insert(placeholder);
+            METRICS.slot_removed(&previous);
+            tracing::debug!("Occupied, replaced with InProgress");
+            Ok(SlotGuard::new(*tenant_shard_id, Some(previous), completion))
+        }
+    }
+}
+
+/// Stops the tenant occupying the slot of `tenant_shard_id` and removes it from memory.
+///
+/// An attached tenant is shut down with [`ShutdownMode::Hard`]; if that shutdown fails the slot
+/// is reverted and [`TenantStateError::IsStopping`] is returned. A secondary location is shut
+/// down as well. Afterwards `tenant_cleanup` is awaited so that the caller can remove other
+/// tenant resources manually, and its value is returned on success.
+///
+/// If the cleanup fails, an attached tenant is marked broken via `set_broken` and the slot is
+/// reverted, so the tenant stays in memory and another removal operation is needed to remove it.
+async fn remove_tenant_from_memory<V, F>(
+    tenants: &std::sync::RwLock<TenantsMap>,
+    tenant_shard_id: TenantShardId,
+    tenant_cleanup: F,
+) -> Result<V, TenantStateError>
+where
+    F: std::future::Future<Output = anyhow::Result<V>>,
+{
+    let mut slot_guard =
+        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
+
+    // allow pageserver shutdown to await for our completion
+    let (_guard, progress) = completion::channel();
+
+    // Holding the SlotGuard means no concurrent API request can act on this tenant ID
+    // while we work on the object that used to live in the slot.
+    let attached_tenant = match slot_guard.get_old_value() {
+        Some(TenantSlot::Attached(tenant)) => {
+            // Removal from memory never flushes or waits for uploads: hard shutdown. Once it
+            // returns Ok the tenant is stopping and its tasks are done, so cleanup is safe.
+            if let Err(_other) = tenant.shutdown(progress, ShutdownMode::Hard).await {
+                // Pageserver shutdown or another detach/ignore is already in flight. That is
+                // a distinct request, so fail right away instead of waiting for it.
+                slot_guard.revert();
+                return Err(TenantStateError::IsStopping(tenant_shard_id));
+            }
+            Some(tenant)
+        }
+        Some(TenantSlot::Secondary(secondary_state)) => {
+            tracing::info!("Shutting down in secondary mode");
+            secondary_state.shutdown().await;
+            None
+        }
+        Some(TenantSlot::InProgress(_)) => {
+            // Acquiring a slot guarantees its old value was not InProgress
+            unreachable!();
+        }
+        None => None,
+    };
+
+    let cleanup_outcome = tenant_cleanup
+        .await
+        .with_context(|| format!("Failed to run cleanup for tenant {tenant_shard_id}"));
+
+    let hook_value = match cleanup_outcome {
+        Ok(hook_value) => hook_value,
+        Err(e) => {
+            // If we had a Tenant, mark it Broken before putting it back in the TenantsMap.
+            if let Some(broken_tenant) = attached_tenant {
+                broken_tenant.set_broken(e.to_string()).await;
+            }
+            // Leave the broken tenant in the map
+            slot_guard.revert();
+
+            return Err(TenantStateError::Other(e));
+        }
+    };
+
+    // Success: drop the old TenantSlot::Attached.
+    slot_guard
+        .drop_old_value()
+        .expect("We just called shutdown");
+
+    Ok(hook_value)
+}
+
 use http_utils::error::ApiError;
 use pageserver_api::models::TimelineGcRequest;

@@ -2879,15 +2866,11 @@ mod tests {
    use std::collections::BTreeMap;
    use std::sync::Arc;

-    use storage_broker::BrokerClientChannel;
    use tracing::Instrument;

    use super::super::harness::TenantHarness;
    use super::TenantsMap;
-    use crate::tenant::{
-        TenantSharedResources,
-        mgr::{BackgroundPurges, TenantManager, TenantSlot},
-    };
+    use crate::tenant::mgr::TenantSlot;

    #[tokio::test(start_paused = true)]
    async fn shutdown_awaits_in_progress_tenant() {
@@ -2908,47 +2891,23 @@ mod tests {
        let _e = span.enter();

        let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);
+        let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants)));

        // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
        // permit it to proceed: that will stick the tenant in InProgress

-        let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::<
-            crate::basebackup_cache::BasebackupPrepareRequest,
-        >();
-
-        let tenant_manager = TenantManager {
-            tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)),
-            conf: h.conf,
-            resources: TenantSharedResources {
-                broker_client: BrokerClientChannel::connect_lazy("foobar.com")
-                    .await
-                    .unwrap(),
-                remote_storage: h.remote_storage.clone(),
-                deletion_queue_client: h.deletion_queue.new_client(),
-                l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new(
-                    h.conf.l0_flush.clone(),
-                ),
-                basebackup_prepare_sender,
-                feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(),
-            },
-            cancel: tokio_util::sync::CancellationToken::new(),
-            background_purges: BackgroundPurges::default(),
-        };
-
-        let tenant_manager = Arc::new(tenant_manager);
-
        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
        let (until_cleanup_started, cleanup_started) = utils::completion::channel();
        let mut remove_tenant_from_memory_task = {
-            let tenant_manager = tenant_manager.clone();
            let jh = tokio::spawn({
+                let tenants = tenants.clone();
                async move {
                    let cleanup = async move {
                        drop(until_cleanup_started);
                        can_complete_cleanup.wait().await;
                        anyhow::Ok(())
                    };
-                    tenant_manager.remove_tenant_from_memory(id, cleanup).await
+                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
                }
                .instrument(h.span())
            });
@@ -2961,11 +2920,9 @@ mod tests {
        let mut shutdown_task = {
            let (until_shutdown_started, shutdown_started) = utils::completion::channel();

-            let tenant_manager = tenant_manager.clone();
-
            let shutdown_task = tokio::spawn(async move {
                drop(until_shutdown_started);
-                tenant_manager.shutdown_all_tenants0().await;
+                super::shutdown_all_tenants0(&tenants).await;
            });

            shutdown_started.wait().await;
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -1092,15 +1092,13 @@ communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 		   MyPState->ring_last <= ring_index);
 }

-/* Internal version. Returns the ring index of the last block (result of this function is used only
-*  when nblocks==1)
-*/
+/* Internal version. Returns the smallest ring index used by any of the requested blocks. */
 static uint64
 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 						  BlockNumber nblocks, const bits8 *mask,
 						  bool is_prefetch)
 {
-	uint64		last_ring_index;
+	uint64		min_ring_index;
 	PrefetchRequest hashkey;
 #ifdef USE_ASSERT_CHECKING
 	bool		any_hits = false;
@@ -1124,12 +1122,13 @@ Retry:
 		MyPState->ring_unused - MyPState->ring_receive;
 	MyNeonCounters->getpage_prefetches_buffered =
 		MyPState->n_responses_buffered;
-	last_ring_index = UINT64_MAX;

+	min_ring_index = UINT64_MAX;
 	for (int i = 0; i < nblocks; i++)
 	{
 		PrefetchRequest *slot = NULL;
 		PrfHashEntry *entry = NULL;
+		uint64		ring_index;
 		neon_request_lsns *lsns;

 		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
@@ -1153,12 +1152,12 @@ Retry:
 		if (entry != NULL)
 		{
 			slot = entry->slot;
-			last_ring_index = slot->my_ring_index;
-			Assert(slot == GetPrfSlot(last_ring_index));
+			ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));

 			Assert(slot->status != PRFS_UNUSED);
-			Assert(MyPState->ring_last <= last_ring_index &&
-				   last_ring_index < MyPState->ring_unused);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
 			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));

 			/*
@@ -1170,9 +1169,9 @@ Retry:
 				if (!neon_prefetch_response_usable(lsns, slot))
 				{
 					/* Wait for the old request to finish and discard it */
-					if (!prefetch_wait_for(last_ring_index))
+					if (!prefetch_wait_for(ring_index))
 						goto Retry;
-					prefetch_set_unused(last_ring_index);
+					prefetch_set_unused(ring_index);
 					entry = NULL;
 					slot = NULL;
 					pgBufferUsage.prefetch.expired += 1;
@@ -1189,12 +1188,13 @@ Retry:
 				 */
 				if (slot->status == PRFS_TAG_REMAINS)
 				{
-					prefetch_set_unused(last_ring_index);
+					prefetch_set_unused(ring_index);
 					entry = NULL;
 					slot = NULL;
 				}
 				else
 				{
+					min_ring_index = Min(min_ring_index, ring_index);
 					/* The buffered request is good enough, return that index */
 					if (is_prefetch)
 						pgBufferUsage.prefetch.duplicates++;
@@ -1283,12 +1283,12 @@ Retry:
 		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
 		 * we can insert the new request to it.
 		 */
-		last_ring_index = MyPState->ring_unused;
+		ring_index = MyPState->ring_unused;

-		Assert(MyPState->ring_last <= last_ring_index &&
-			   last_ring_index <= MyPState->ring_unused);
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index <= MyPState->ring_unused);

-		slot = GetPrfSlotNoCheck(last_ring_index);
+		slot = GetPrfSlotNoCheck(ring_index);

 		Assert(slot->status == PRFS_UNUSED);

@@ -1298,9 +1298,11 @@ Retry:
 		 */
 		slot->buftag = hashkey.buftag;
 		slot->shard_no = get_shard_number(&tag);
-		slot->my_ring_index = last_ring_index;
+		slot->my_ring_index = ring_index;
 		slot->flags = 0;

+		min_ring_index = Min(min_ring_index, ring_index);
+
 		if (is_prefetch)
 			MyNeonCounters->getpage_prefetch_requests_total++;
 		else
@@ -1313,12 +1315,11 @@ Retry:
 		MyPState->ring_unused - MyPState->ring_receive;

 	Assert(any_hits);
-	Assert(last_ring_index != UINT64_MAX);

-	Assert(GetPrfSlot(last_ring_index)->status == PRFS_REQUESTED ||
-		   GetPrfSlot(last_ring_index)->status == PRFS_RECEIVED);
-	Assert(MyPState->ring_last <= last_ring_index &&
-		   last_ring_index < MyPState->ring_unused);
+	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
+		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
+	Assert(MyPState->ring_last <= min_ring_index &&
+		   min_ring_index < MyPState->ring_unused);

 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
@@ -1334,7 +1335,7 @@ Retry:
 		MyPState->ring_flush = MyPState->ring_unused;
 	}

-	return last_ring_index;
+	return min_ring_index;
 }

 static bool
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1135,7 +1135,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
 				wp->propTermStartLsn = sk->voteResponse.flushLsn;
 				wp->donor = sk;
 			}
-			wp->truncateLsn = Max(sk->voteResponse.truncateLsn, wp->truncateLsn);
+			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);

 			if (n_votes > 0)
 				appendStringInfoString(s, ", ");
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -14,9 +14,9 @@ use crate::context::RequestContext;
 use crate::control_plane::client::cplane_proxy_v1;
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::{ReportableError, UserFacingError};
+use crate::pglb::connect_compute::ComputeConnectBackend;
 use crate::pqproto::BeMessage;
 use crate::proxy::NeonOptions;
-use crate::proxy::wake_compute::WakeComputeBackend;
 use crate::stream::PqStream;
 use crate::types::RoleName;
 use crate::{auth, compute, waiters};
@@ -109,7 +109,7 @@ impl ConsoleRedirectBackend {
 pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo);

 #[async_trait]
-impl WakeComputeBackend for ConsoleRedirectNodeInfo {
+impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
    async fn wake_compute(
        &self,
        _ctx: &RequestContext,
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -14,21 +14,20 @@ use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{debug, info};

-use crate::auth::{self, ComputeUserInfoMaybeEndpoint, validate_password_and_exchange};
+use crate::auth::{self, AuthError, ComputeUserInfoMaybeEndpoint, validate_password_and_exchange};
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
 use crate::context::RequestContext;
 use crate::control_plane::client::ControlPlaneClient;
 use crate::control_plane::errors::GetAuthInfoError;
-use crate::control_plane::messages::EndpointRateLimitConfig;
 use crate::control_plane::{
    self, AccessBlockerFlags, AuthSecret, CachedNodeInfo, ControlPlaneApi, EndpointAccessControl,
    RoleAccessControl,
 };
 use crate::intern::EndpointIdInt;
+use crate::pglb::connect_compute::ComputeConnectBackend;
 use crate::pqproto::BeMessage;
 use crate::proxy::NeonOptions;
-use crate::proxy::wake_compute::WakeComputeBackend;
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::stream::Stream;
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
@@ -231,8 +230,11 @@ async fn auth_quirks(
        config.is_vpc_acccess_proxy,
    )?;

-    access_controls.connection_attempt_rate_limit(ctx, &info.endpoint, &endpoint_rate_limiter)?;
-
+    let endpoint = EndpointIdInt::from(&info.endpoint);
+    let rate_limit_config = None;
+    if !endpoint_rate_limiter.check(endpoint, rate_limit_config, 1) {
+        return Err(AuthError::too_many_connections());
+    }
    let role_access = api
        .get_role_access_control(ctx, &info.endpoint, &info.user)
        .await?;
@@ -399,20 +401,19 @@ impl Backend<'_, ComputeUserInfo> {
                allowed_ips: Arc::new(vec![]),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
-                rate_limits: EndpointRateLimitConfig::default(),
            }),
        }
    }
 }

 #[async_trait::async_trait]
-impl WakeComputeBackend for Backend<'_, ComputeUserInfo> {
+impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
    async fn wake_compute(
        &self,
        ctx: &RequestContext,
    ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
        match self {
-            Self::ControlPlane(api, info) => api.wake_compute(ctx, info).await,
+            Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
        }
    }
@@ -438,7 +439,6 @@ mod tests {
    use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
    use crate::config::AuthenticationConfig;
    use crate::context::RequestContext;
-    use crate::control_plane::messages::EndpointRateLimitConfig;
    use crate::control_plane::{
        self, AccessBlockerFlags, CachedNodeInfo, EndpointAccessControl, RoleAccessControl,
    };
@@ -477,7 +477,6 @@ mod tests {
                allowed_ips: Arc::new(self.ips.clone()),
                allowed_vpce: Arc::new(self.vpc_endpoint_ids.clone()),
                flags: self.access_blocker_flags,
-                rate_limits: EndpointRateLimitConfig::default(),
            })
        }

--- a/proxy/src/batch.rs
+++ b/proxy/src/batch.rs
@@ -1,146 +0,0 @@
-//! Batch processing system based on intrusive linked lists.
-//!
-//! Enqueuing a batch job requires no allocations, with
-//! direct support for cancelling jobs early.
-use std::collections::BTreeMap;
-use std::pin::pin;
-use std::sync::Mutex;
-
-use futures::future::Either;
-use scopeguard::ScopeGuard;
-use tokio::sync::oneshot::error::TryRecvError;
-
-use crate::ext::LockExt;
-
-pub trait QueueProcessing: Send + 'static {
-    type Req: Send + 'static;
-    type Res: Send;
-
-    /// Get the desired batch size.
-    fn batch_size(&self, queue_size: usize) -> usize;
-
-    /// This applies a full batch of events.
-    /// Must respond with a full batch of replies.
-    ///
-    /// If this apply can error, it's expected that errors be forwarded to each Self::Res.
-    ///
-    /// Batching does not need to happen atomically.
-    fn apply(&mut self, req: Vec<Self::Req>) -> impl Future<Output = Vec<Self::Res>> + Send;
-}
-
-pub struct BatchQueue<P: QueueProcessing> {
-    processor: tokio::sync::Mutex<P>,
-    inner: Mutex<BatchQueueInner<P>>,
-}
-
-struct BatchJob<P: QueueProcessing> {
-    req: P::Req,
-    res: tokio::sync::oneshot::Sender<P::Res>,
-}
-
-impl<P: QueueProcessing> BatchQueue<P> {
-    pub fn new(p: P) -> Self {
-        Self {
-            processor: tokio::sync::Mutex::new(p),
-            inner: Mutex::new(BatchQueueInner {
-                version: 0,
-                queue: BTreeMap::new(),
-            }),
-        }
-    }
-
-    pub async fn call(&self, req: P::Req) -> P::Res {
-        let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
-        let guard = scopeguard::guard(id, move |id| {
-            let mut inner = self.inner.lock_propagate_poison();
-            if inner.queue.remove(&id).is_some() {
-                tracing::debug!("batched task cancelled before completion");
-            }
-        });
-
-        let resp = loop {
-            // try become the leader, or try wait for success.
-            let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await
-            {
-                // we got the resp.
-                Either::Left((resp, _)) => break resp.ok(),
-                // we are the leader.
-                Either::Right((p, rx_)) => {
-                    rx = rx_;
-                    p
-                }
-            };
-
-            let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor);
-
-            // apply a batch.
-            let values = processor.apply(reqs).await;
-
-            // send response values.
-            for (tx, value) in std::iter::zip(resps, values) {
-                // sender hung up but that's fine.
-                drop(tx.send(value));
-            }
-
-            match rx.try_recv() {
-                Ok(resp) => break Some(resp),
-                Err(TryRecvError::Closed) => break None,
-                // edge case - there was a race condition where
-                // we became the leader but were not in the batch.
-                //
-                // Example:
-                // thread 1: register job id=1
-                // thread 2: register job id=2
-                // thread 2: processor.lock().await
-                // thread 1: processor.lock().await
-                // thread 2: becomes leader, batch_size=1, jobs=[1].
-                Err(TryRecvError::Empty) => {}
-            }
-        };
-
-        // already removed.
-        ScopeGuard::into_inner(guard);
-
-        resp.expect("no response found. batch processer should not panic")
-    }
-}
-
-struct BatchQueueInner<P: QueueProcessing> {
-    version: u64,
-    queue: BTreeMap<u64, BatchJob<P>>,
-}
-
-impl<P: QueueProcessing> BatchQueueInner<P> {
-    fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver<P::Res>) {
-        let (tx, rx) = tokio::sync::oneshot::channel();
-
-        let id = self.version;
-
-        // Overflow concern:
-        // This is a u64, and we might enqueue 2^16 tasks per second.
-        // This gives us 2^48 seconds (9 million years).
-        // Even if this does overflow, it will not break, but some
-        // jobs with the higher version might never get prioritised.
-        self.version += 1;
-
-        self.queue.insert(id, BatchJob { req, res: tx });
-
-        (id, rx)
-    }
-
-    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<tokio::sync::oneshot::Sender<P::Res>>) {
-        let batch_size = p.batch_size(self.queue.len());
-        let mut reqs = Vec::with_capacity(batch_size);
-        let mut resps = Vec::with_capacity(batch_size);
-
-        while reqs.len() < batch_size {
-            let Some((_, job)) = self.queue.pop_first() else {
-                break;
-            };
-            reqs.push(job.req);
-            resps.push(job.res);
-        }
-
-        (reqs, resps)
-    }
-}
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -201,7 +201,7 @@ pub async fn run() -> anyhow::Result<()> {
        auth_backend,
        http_listener,
        shutdown.clone(),
-        Arc::new(CancellationHandler::new()),
+        Arc::new(CancellationHandler::new(&config.connect_to_compute, None)),
        endpoint_rate_limiter,
    );

--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -28,9 +28,10 @@ use crate::context::RequestContext;
 use crate::metrics::{Metrics, ThreadPoolMetrics};
 use crate::pqproto::FeStartupPacket;
 use crate::protocol2::ConnectionInfo;
-use crate::proxy::{ErrorSource, TlsRequired, copy_bidirectional_client_compute};
+use crate::proxy::{
+    ErrorSource, TlsRequired, copy_bidirectional_client_compute, run_until_cancelled,
+};
 use crate::stream::{PqStream, Stream};
-use crate::util::run_until_cancelled;

 project_git_version!(GIT_VERSION);

--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -21,8 +21,7 @@ use utils::{project_build_tag, project_git_version};

 use crate::auth::backend::jwt::JwkCache;
 use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
-use crate::batch::BatchQueue;
-use crate::cancellation::{CancellationHandler, CancellationProcessor};
+use crate::cancellation::{CancellationHandler, handle_cancel_messages};
 use crate::config::{
    self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
    ProxyConfig, ProxyProtocolV2, remote_storage_from_toml,
@@ -391,7 +390,13 @@ pub async fn run() -> anyhow::Result<()> {
        .as_ref()
        .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));

-    let cancellation_handler = Arc::new(CancellationHandler::new());
+    // channel size should be higher than redis client limit to avoid blocking
+    let cancel_ch_size = args.cancellation_ch_size;
+    let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size);
+    let cancellation_handler = Arc::new(CancellationHandler::new(
+        &config.connect_to_compute,
+        Some(tx_cancel),
+    ));

    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
        RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
@@ -518,10 +523,14 @@ pub async fn run() -> anyhow::Result<()> {
            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
-                    cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
-                        client: redis_kv_client,
-                        batch_size: args.cancellation_batch_size,
-                    }));
+                    handle_cancel_messages(
+                        &mut redis_kv_client,
+                        rx_cancel,
+                        args.cancellation_batch_size,
+                    )
+                    .await?;
+
+                    drop(redis_kv_client);

                    // `handle_cancel_messages` was terminated due to the tx_cancel
                    // being dropped. this is not worthy of an error, and this task can only return `Err`,
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -364,7 +364,6 @@ mod tests {
    use std::sync::Arc;

    use super::*;
-    use crate::control_plane::messages::EndpointRateLimitConfig;
    use crate::control_plane::{AccessBlockerFlags, AuthSecret};
    use crate::scram::ServerSecret;
    use crate::types::ProjectId;
@@ -400,7 +399,6 @@ mod tests {
                allowed_ips: allowed_ips.clone(),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
-                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret1.clone(),
@@ -416,7 +414,6 @@ mod tests {
                allowed_ips: allowed_ips.clone(),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
-                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret2.clone(),
@@ -442,7 +439,6 @@ mod tests {
                allowed_ips: allowed_ips.clone(),
                allowed_vpce: Arc::new(vec![]),
                flags: AccessBlockerFlags::default(),
-                rate_limits: EndpointRateLimitConfig::default(),
            },
            RoleAccessControl {
                secret: secret3.clone(),
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,22 +1,20 @@
-use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
-use std::sync::{Arc, OnceLock};
-use std::time::Duration;
+use std::sync::Arc;

-use anyhow::anyhow;
-use futures::FutureExt;
+use anyhow::{Context, anyhow};
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
-use postgres_client::RawCancelToken;
+use postgres_client::CancelToken;
+use postgres_client::tls::MakeTlsConnect;
 use redis::{Cmd, FromRedisValue, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio::time::timeout;
-use tracing::{debug, error, info};
+use tokio::sync::{mpsc, oneshot};
+use tracing::{debug, error, info, warn};

 use crate::auth::AuthError;
 use crate::auth::backend::ComputeUserInfo;
-use crate::batch::{BatchQueue, QueueProcessing};
+use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::ControlPlaneApi;
 use crate::error::ReportableError;
@@ -29,36 +27,46 @@ use crate::redis::kv_ops::RedisKVClient;

 type IpSubnetKey = IpNet;

-const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600);
-const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570);
+const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
    StoreCancelKey {
-        key: CancelKeyData,
-        value: Box<str>,
-        expire: std::time::Duration,
+        key: String,
+        field: String,
+        value: String,
+        resp_tx: Option<oneshot::Sender<anyhow::Result<()>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+        expire: i64, // TTL for key
    },
    GetCancelData {
-        key: CancelKeyData,
+        key: String,
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        key: String,
+        field: String,
+        resp_tx: Option<oneshot::Sender<anyhow::Result<()>>>,
+        _guard: CancelChannelSizeGuard<'static>,
    },
 }

 pub struct Pipeline {
    inner: redis::Pipeline,
-    replies: usize,
+    replies: Vec<CancelReplyOp>,
 }

 impl Pipeline {
    fn with_capacity(n: usize) -> Self {
        Self {
            inner: redis::Pipeline::with_capacity(n),
-            replies: 0,
+            replies: Vec::with_capacity(n),
        }
    }

-    async fn execute(self, client: &mut RedisKVClient) -> Vec<anyhow::Result<Value>> {
-        let responses = self.replies;
+    async fn execute(&mut self, client: &mut RedisKVClient) {
+        let responses = self.replies.len();
        let batch_size = self.inner.len();

        match client.query(&self.inner).await {
@@ -68,73 +76,176 @@ impl Pipeline {
                    batch_size,
                    responses, "successfully completed cancellation jobs",
                );
-                values.into_iter().map(Ok).collect()
+                for (value, reply) in std::iter::zip(values, self.replies.drain(..)) {
+                    reply.send_value(value);
+                }
            }
            Ok(value) => {
                error!(batch_size, ?value, "unexpected redis return value");
-                std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis")))
-                    .take(responses)
-                    .collect()
+                for reply in self.replies.drain(..) {
+                    reply.send_err(anyhow!("incorrect response type from redis"));
+                }
            }
            Err(err) => {
-                std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}")))
-                    .take(responses)
-                    .collect()
+                for reply in self.replies.drain(..) {
+                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
+                }
            }
        }
+
+        self.inner.clear();
+        self.replies.clear();
    }

-    fn add_command_with_reply(&mut self, cmd: Cmd) {
+    fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) {
        self.inner.add_command(cmd);
-        self.replies += 1;
+        self.replies.push(reply);
    }

    fn add_command_no_reply(&mut self, cmd: Cmd) {
        self.inner.add_command(cmd).ignore();
    }
+
+    fn add_command(&mut self, cmd: Cmd, reply: Option<CancelReplyOp>) {
+        match reply {
+            Some(reply) => self.add_command_with_reply(cmd, reply),
+            None => self.add_command_no_reply(cmd),
+        }
+    }
 }

 impl CancelKeyOp {
-    fn register(&self, pipe: &mut Pipeline) {
+    fn register(self, pipe: &mut Pipeline) {
        #[allow(clippy::used_underscore_binding)]
        match self {
-            CancelKeyOp::StoreCancelKey { key, value, expire } => {
-                let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value));
-                pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64));
+            CancelKeyOp::StoreCancelKey {
+                key,
+                field,
+                value,
+                resp_tx,
+                _guard,
+                expire,
+            } => {
+                let reply =
+                    resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard });
+                pipe.add_command(Cmd::hset(&key, field, value), reply);
+                pipe.add_command_no_reply(Cmd::expire(key, expire));
            }
-            CancelKeyOp::GetCancelData { key } => {
-                let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hget(key, "data"));
+            CancelKeyOp::GetCancelData {
+                key,
+                resp_tx,
+                _guard,
+            } => {
+                let reply = CancelReplyOp::GetCancelData { resp_tx, _guard };
+                pipe.add_command_with_reply(Cmd::hgetall(key), reply);
+            }
+            CancelKeyOp::RemoveCancelKey {
+                key,
+                field,
+                resp_tx,
+                _guard,
+            } => {
+                let reply =
+                    resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard });
+                pipe.add_command(Cmd::hdel(key, field), reply);
            }
        }
    }
 }

-pub struct CancellationProcessor {
-    pub client: RedisKVClient,
-    pub batch_size: usize,
+// Reply handles kept by `Pipeline`; each one returns a redis result (or error) to the requester
+pub enum CancelReplyOp {
+    StoreCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    GetCancelData {
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
 }

-impl QueueProcessing for CancellationProcessor {
-    type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
-    type Res = anyhow::Result<redis::Value>;
-
-    fn batch_size(&self, _queue_size: usize) -> usize {
-        self.batch_size
+impl CancelReplyOp {
+    fn send_err(self, e: anyhow::Error) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
    }

-    async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
-        let mut pipeline = Pipeline::with_capacity(batch.len());
+    fn send_value(self, v: redis::Value) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+}
+
+// Runs as a separate task: drains `rx` in batches of up to `batch_size`, one redis pipeline per batch
+pub async fn handle_cancel_messages(
+    client: &mut RedisKVClient,
+    mut rx: mpsc::Receiver<CancelKeyOp>,
+    batch_size: usize,
+) -> anyhow::Result<()> {
+    let mut batch = Vec::with_capacity(batch_size);
+    let mut pipeline = Pipeline::with_capacity(batch_size);
+
+    loop {
+        if rx.recv_many(&mut batch, batch_size).await == 0 {
+            warn!("shutting down cancellation queue");
+            break Ok(());
+        }

        let batch_size = batch.len();
        debug!(batch_size, "running cancellation jobs");

-        for (_, op) in &batch {
-            op.register(&mut pipeline);
+        for msg in batch.drain(..) {
+            msg.register(&mut pipeline);
        }

-        pipeline.execute(&mut self.client).await
+        pipeline.execute(client).await;
    }
 }

@@ -142,9 +253,10 @@ impl QueueProcessing for CancellationProcessor {
 ///
 /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
 pub struct CancellationHandler {
+    compute_config: &'static ComputeConfig,
    // rate limiter of cancellation requests
    limiter: Arc<std::sync::Mutex<LeakyBucketRateLimiter<IpSubnetKey>>>,
-    tx: OnceLock<BatchQueue<CancellationProcessor>>, // send messages to the redis KV client task
+    tx: Option<mpsc::Sender<CancelKeyOp>>, // send messages to the redis KV client task
 }

 #[derive(Debug, Error)]
@@ -184,9 +296,13 @@ impl ReportableError for CancelError {
 }

 impl CancellationHandler {
-    pub fn new() -> Self {
+    pub fn new(
+        compute_config: &'static ComputeConfig,
+        tx: Option<mpsc::Sender<CancelKeyOp>>,
+    ) -> Self {
        Self {
-            tx: OnceLock::new(),
+            compute_config,
+            tx,
            limiter: Arc::new(std::sync::Mutex::new(
                LeakyBucketRateLimiter::<IpSubnetKey>::new_with_shards(
                    LeakyBucketRateLimiter::<IpSubnetKey>::DEFAULT,
@@ -196,14 +312,7 @@ impl CancellationHandler {
        }
    }

-    pub fn init_tx(&self, queue: BatchQueue<CancellationProcessor>) {
-        self.tx
-            .set(queue)
-            .map_err(|_| {})
-            .expect("cancellation queue should be registered once");
-    }
-
-    pub(crate) fn get_key(self: Arc<Self>) -> Session {
+    pub(crate) fn get_key(self: &Arc<Self>) -> Session {
        // we intentionally generate a random "backend pid" and "secret key" here.
        // we use the corresponding u64 as an identifier for the
        // actual endpoint+pid+secret for postgres/pgbouncer.
@@ -213,10 +322,14 @@ impl CancellationHandler {

        let key: CancelKeyData = rand::random();

+        let prefix_key: KeyPrefix = KeyPrefix::Cancel(key);
+        let redis_key = prefix_key.build_redis_key();
+
        debug!("registered new query cancellation key {key}");
        Session {
            key,
-            cancellation_handler: self,
+            redis_key,
+            cancellation_handler: Arc::clone(self),
        }
    }

@@ -224,43 +337,62 @@ impl CancellationHandler {
        &self,
        key: CancelKeyData,
    ) -> Result<Option<CancelClosure>, CancelError> {
-        let guard = Metrics::get()
-            .proxy
-            .cancel_channel_size
-            .guard(RedisMsgKind::HGet);
-        let op = CancelKeyOp::GetCancelData { key };
+        let prefix_key: KeyPrefix = KeyPrefix::Cancel(key);
+        let redis_key = prefix_key.build_redis_key();

-        let Some(tx) = self.tx.get() else {
+        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
+        let op = CancelKeyOp::GetCancelData {
+            key: redis_key,
+            resp_tx,
+            _guard: Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HGetAll),
+        };
+
+        let Some(tx) = &self.tx else {
            tracing::warn!("cancellation handler is not available");
            return Err(CancelError::InternalError);
        };

-        const TIMEOUT: Duration = Duration::from_secs(5);
-        let result = timeout(TIMEOUT, tx.call((guard, op)))
-            .await
-            .map_err(|_| {
-                tracing::warn!("timed out waiting to receive GetCancelData response");
-                CancelError::RateLimit
-            })?
+        tx.try_send(op)
            .map_err(|e| {
-                tracing::warn!("failed to receive GetCancelData response: {e}");
-                CancelError::InternalError
-            })?;
+                tracing::warn!("failed to send GetCancelData for {key}: {e}");
+            })
+            .map_err(|()| CancelError::InternalError)?;

-        let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| {
+        let result = resp_rx.await.map_err(|e| {
            tracing::warn!("failed to receive GetCancelData response: {e}");
            CancelError::InternalError
        })?;

-        let cancel_closure: CancelClosure =
-            serde_json::from_str(&cancel_state_str).map_err(|e| {
-                tracing::warn!("failed to deserialize cancel state: {e}");
-                CancelError::InternalError
-            })?;
+        let cancel_state_str: Option<String> = match result {
+            Ok(mut state) => {
+                if state.len() == 1 {
+                    Some(state.remove(0).1)
+                } else {
+                    tracing::warn!("unexpected number of entries in cancel state: {state:?}");
+                    return Err(CancelError::InternalError);
+                }
+            }
+            Err(e) => {
+                tracing::warn!("failed to receive cancel state from redis: {e}");
+                return Err(CancelError::InternalError);
+            }
+        };

-        Ok(Some(cancel_closure))
+        let cancel_state: Option<CancelClosure> = match cancel_state_str {
+            Some(state) => {
+                let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| {
+                    tracing::warn!("failed to deserialize cancel state: {e}");
+                    CancelError::InternalError
+                })?;
+                Some(cancel_closure)
+            }
+            None => None,
+        };
+        Ok(cancel_state)
    }
-
    /// Try to cancel a running query for the corresponding connection.
    /// If the cancellation key is not found, it will be published to Redis.
    /// check_allowed - if true, check if the IP is allowed to cancel the query.
@@ -328,17 +460,17 @@ impl CancellationHandler {
                kind: crate::metrics::CancellationOutcome::Found,
            });
        info!("cancelling query per user's request using key {key}");
-        cancel_closure.try_cancel_query().await
+        cancel_closure.try_cancel_query(self.compute_config).await
    }
 }

 /// This should've been a [`std::future::Future`], but
 /// it's impossible to name a type of an unboxed future
 /// (we'd need something like `#![feature(type_alias_impl_trait)]`).
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct CancelClosure {
    socket_addr: SocketAddr,
-    cancel_token: RawCancelToken,
+    cancel_token: CancelToken,
    hostname: String, // for pg_sni router
    user_info: ComputeUserInfo,
 }
@@ -346,7 +478,7 @@ pub struct CancelClosure {
 impl CancelClosure {
    pub(crate) fn new(
        socket_addr: SocketAddr,
-        cancel_token: RawCancelToken,
+        cancel_token: CancelToken,
        hostname: String,
        user_info: ComputeUserInfo,
    ) -> Self {
@@ -358,9 +490,19 @@ impl CancelClosure {
        }
    }
    /// Cancels the query running on user's compute node.
-    pub(crate) async fn try_cancel_query(&self) -> Result<(), CancelError> {
+    pub(crate) async fn try_cancel_query(
+        self,
+        compute_config: &ComputeConfig,
+    ) -> Result<(), CancelError> {
        let socket = TcpStream::connect(self.socket_addr).await?;
-        self.cancel_token.cancel_query_raw(socket).await?;
+
+        let tls = <_ as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
+            compute_config,
+            &self.hostname,
+        )
+        .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?;
+
+        self.cancel_token.cancel_query_raw(socket, tls).await?;
        debug!("query was cancelled");
        Ok(())
    }
@@ -370,6 +512,7 @@ impl CancelClosure {
 pub(crate) struct Session {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
+    redis_key: String,
    cancellation_handler: Arc<CancellationHandler>,
 }

@@ -378,61 +521,60 @@ impl Session {
        &self.key
    }

-    /// Ensure the cancel key is continously refreshed,
-    /// but stop when the channel is dropped.
-    pub(crate) async fn maintain_cancel_key(
+    // Queue a store-key op (with TTL) for the redis task; a failed enqueue is only logged, not returned
+    pub(crate) fn write_cancel_key(
        &self,
-        session_id: uuid::Uuid,
-        cancel: tokio::sync::oneshot::Receiver<Infallible>,
-        cancel_closure: &CancelClosure,
-    ) {
-        futures::future::select(
-            std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)),
-            cancel,
-        )
-        .await;
-
-        if let Err(err) = cancel_closure.try_cancel_query().boxed().await {
-            tracing::warn!(
-                ?session_id,
-                ?err,
-                "could not cancel the query in the database"
-            );
-        }
-    }
-
-    // Ensure the cancel key is continously refreshed.
-    async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! {
-        let Some(tx) = self.cancellation_handler.tx.get() else {
+        cancel_closure: CancelClosure,
+    ) -> Result<(), CancelError> {
+        let Some(tx) = &self.cancellation_handler.tx else {
            tracing::warn!("cancellation handler is not available");
-            // don't exit, as we only want to exit if cancelled externally.
-            std::future::pending().await
+            return Err(CancelError::InternalError);
        };

-        let closure_json = serde_json::to_string(&cancel_closure)
-            .expect("serialising to json string should not fail")
-            .into_boxed_str();
+        let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| {
+            tracing::warn!("failed to serialize cancel closure: {e}");
+            CancelError::InternalError
+        })?;

-        loop {
-            let guard = Metrics::get()
+        let op = CancelKeyOp::StoreCancelKey {
+            key: self.redis_key.clone(),
+            field: "data".to_string(),
+            value: closure_json,
+            resp_tx: None,
+            _guard: Metrics::get()
                .proxy
                .cancel_channel_size
-                .guard(RedisMsgKind::HSet);
-            let op = CancelKeyOp::StoreCancelKey {
-                key: self.key,
-                value: closure_json.clone(),
-                expire: CANCEL_KEY_TTL,
-            };
+                .guard(RedisMsgKind::HSet),
+            expire: CANCEL_KEY_TTL,
+        };

-            tracing::debug!(
-                src=%self.key,
-                dest=?cancel_closure.cancel_token,
-                "registering cancellation key"
-            );
+        let _ = tx.try_send(op).map_err(|e| {
+            let key = self.key;
+            tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
+        });
+        Ok(())
+    }

-            if tx.call((guard, op)).await.is_ok() {
-                tokio::time::sleep(CANCEL_KEY_REFRESH).await;
-            }
-        }
+    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
+        let Some(tx) = &self.cancellation_handler.tx else {
+            tracing::warn!("cancellation handler is not available");
+            return Err(CancelError::InternalError);
+        };
+
+        let op = CancelKeyOp::RemoveCancelKey {
+            key: self.redis_key.clone(),
+            field: "data".to_string(),
+            resp_tx: None,
+            _guard: Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HDel),
+        };
+
+        let _ = tx.try_send(op).map_err(|e| {
+            let key = self.key;
+            tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
+        });
+        Ok(())
    }
 }
--- a/proxy/src/compute/mod.rs
+++ b/proxy/src/compute/mod.rs
@@ -9,7 +9,7 @@ use itertools::Itertools;
 use postgres_client::config::{AuthKeys, SslMode};
 use postgres_client::maybe_tls_stream::MaybeTlsStream;
 use postgres_client::tls::MakeTlsConnect;
-use postgres_client::{NoTls, RawCancelToken, RawConnection};
+use postgres_client::{CancelToken, NoTls, RawConnection};
 use postgres_protocol::message::backend::NoticeResponseBody;
 use thiserror::Error;
 use tokio::net::{TcpStream, lookup_host};
@@ -136,11 +136,11 @@ impl AuthInfo {
        }
    }

-    pub(crate) fn with_auth_keys(keys: ComputeCredentialKeys) -> Self {
+    pub(crate) fn with_auth_keys(keys: &ComputeCredentialKeys) -> Self {
        Self {
            auth: match keys {
                ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => {
-                    Some(Auth::Scram(Box::new(auth_keys)))
+                    Some(Auth::Scram(Box::new(*auth_keys)))
                }
                ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => None,
            },
@@ -265,8 +265,7 @@ impl ConnectInfo {
    }
 }

-pub type RustlsStream = <ComputeConfig as MakeTlsConnect<tokio::net::TcpStream>>::Stream;
-pub type MaybeRustlsStream = MaybeTlsStream<tokio::net::TcpStream, RustlsStream>;
+type RustlsStream = <ComputeConfig as MakeTlsConnect<tokio::net::TcpStream>>::Stream;

 pub(crate) struct PostgresConnection {
    /// Socket connected to a compute node.
@@ -280,7 +279,7 @@ pub(crate) struct PostgresConnection {
    /// Notices received from compute after authenticating
    pub(crate) delayed_notice: Vec<NoticeResponseBody>,

-    pub(crate) guage: NumDbConnectionsGuard<'static>,
+    _guage: NumDbConnectionsGuard<'static>,
 }

 impl ConnectInfo {
@@ -328,7 +327,9 @@ impl ConnectInfo {
        // Yet another reason to rework the connection establishing code.
        let cancel_closure = CancelClosure::new(
            socket_addr,
-            RawCancelToken {
+            CancelToken {
+                socket_config: None,
+                ssl_mode: self.ssl_mode,
                process_id,
                secret_key,
            },
@@ -342,7 +343,7 @@ impl ConnectInfo {
            delayed_notice,
            cancel_closure,
            aux,
-            guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
+            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
        };

        Ok(connection)
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -11,12 +11,13 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
 use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
+use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute};
 use crate::pglb::handshake::{HandshakeData, handshake};
 use crate::pglb::passthrough::ProxyPassthrough;
 use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
-use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute};
-use crate::proxy::{ClientRequestError, ErrorSource, prepare_client_connection};
-use crate::util::run_until_cancelled;
+use crate::proxy::{
+    ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled,
+};

 pub async fn task_main(
    config: &'static ProxyConfig,
@@ -120,7 +121,7 @@ pub async fn task_main(
                Ok(Some(p)) => {
                    ctx.set_success();
                    let _disconnect = ctx.log_connect();
-                    match p.proxy_pass().await {
+                    match p.proxy_pass(&config.connect_to_compute).await {
                        Ok(()) => {}
                        Err(ErrorSource::Client(e)) => {
                            error!(
@@ -232,30 +233,22 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
    .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
    .await?;

-    let session = cancellation_handler.get_key();
+    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+    let session = cancellation_handler_clone.get_key();
+
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream);
    let stream = stream.flush_and_into_inner().await?;

-    let session_id = ctx.session_id();
-    let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel();
-    tokio::spawn(async move {
-        session
-            .maintain_cancel_key(session_id, cancel, &node.cancel_closure)
-            .await;
-    });
-
    Ok(Some(ProxyPassthrough {
        client: stream,
-        compute: node.stream,
-
-        aux: node.aux,
+        aux: node.aux.clone(),
        private_link_id: None,
-
-        _cancel_on_shutdown: cancel_on_shutdown,
-
+        compute: node,
+        session_id: ctx.session_id(),
+        cancel: session,
        _req: request_gauge,
        _conn: conn_gauge,
-        _db_conn: node.guage,
    }))
 }
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -146,7 +146,6 @@ impl NeonControlPlaneClient {
                    public_access_blocked: block_public_connections,
                    vpc_access_blocked: block_vpc_connections,
                },
-                rate_limits: body.rate_limits,
            })
        }
        .inspect_err(|e| tracing::debug!(error = ?e))
@@ -313,7 +312,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
            allowed_ips: Arc::new(auth_info.allowed_ips),
            allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
            flags: auth_info.access_blocker_flags,
-            rate_limits: auth_info.rate_limits,
        };
        let role_control = RoleAccessControl {
            secret: auth_info.secret,
@@ -359,7 +357,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
            allowed_ips: Arc::new(auth_info.allowed_ips),
            allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
            flags: auth_info.access_blocker_flags,
-            rate_limits: auth_info.rate_limits,
        };
        let role_control = RoleAccessControl {
            secret: auth_info.secret,
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -20,7 +20,7 @@ use crate::context::RequestContext;
 use crate::control_plane::errors::{
    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
-use crate::control_plane::messages::{EndpointRateLimitConfig, MetricsAuxInfo};
+use crate::control_plane::messages::MetricsAuxInfo;
 use crate::control_plane::{
    AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
    RoleAccessControl,
@@ -130,7 +130,6 @@ impl MockControlPlane {
            project_id: None,
            account_id: None,
            access_blocker_flags: AccessBlockerFlags::default(),
-            rate_limits: EndpointRateLimitConfig::default(),
        })
    }

@@ -234,7 +233,6 @@ impl super::ControlPlaneApi for MockControlPlane {
            allowed_ips: Arc::new(info.allowed_ips),
            allowed_vpce: Arc::new(info.allowed_vpc_endpoint_ids),
            flags: info.access_blocker_flags,
-            rate_limits: info.rate_limits,
        })
    }

--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -10,7 +10,6 @@ use clashmap::ClashMap;
 use tokio::time::Instant;
 use tracing::{debug, info};

-use super::{EndpointAccessControl, RoleAccessControl};
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
 use crate::cache::endpoints::EndpointsCache;
@@ -23,6 +22,8 @@ use crate::metrics::ApiLockMetrics;
 use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
 use crate::types::EndpointId;

+use super::{EndpointAccessControl, RoleAccessControl};
+
 #[non_exhaustive]
 #[derive(Clone)]
 pub enum ControlPlaneClient {
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -227,35 +227,12 @@ pub(crate) struct UserFacingMessage {
 #[derive(Deserialize)]
 pub(crate) struct GetEndpointAccessControl {
    pub(crate) role_secret: Box<str>,
-
-    pub(crate) project_id: Option<ProjectIdInt>,
-    pub(crate) account_id: Option<AccountIdInt>,
-
    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<String>>,
+    pub(crate) project_id: Option<ProjectIdInt>,
+    pub(crate) account_id: Option<AccountIdInt>,
    pub(crate) block_public_connections: Option<bool>,
    pub(crate) block_vpc_connections: Option<bool>,
-
-    #[serde(default)]
-    pub(crate) rate_limits: EndpointRateLimitConfig,
-}
-
-#[derive(Copy, Clone, Deserialize, Default)]
-pub struct EndpointRateLimitConfig {
-    pub connection_attempts: ConnectionAttemptsLimit,
-}
-
-#[derive(Copy, Clone, Deserialize, Default)]
-pub struct ConnectionAttemptsLimit {
-    pub tcp: Option<LeakyBucketSetting>,
-    pub ws: Option<LeakyBucketSetting>,
-    pub http: Option<LeakyBucketSetting>,
-}
-
-#[derive(Copy, Clone, Deserialize)]
-pub struct LeakyBucketSetting {
-    pub rps: f64,
-    pub burst: f64,
 }

 /// Response which holds compute node's `host:port` pair.
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -11,8 +11,6 @@ pub(crate) mod errors;

 use std::sync::Arc;

-use messages::EndpointRateLimitConfig;
-
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::AuthRule;
 use crate::auth::{AuthError, IpPattern, check_peer_addr_is_in_list};
@@ -20,9 +18,8 @@ use crate::cache::{Cached, TimedLru};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
-use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt};
+use crate::intern::{AccountIdInt, ProjectIdInt};
 use crate::protocol2::ConnectionInfoExtra;
-use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig};
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
 use crate::{compute, scram};

@@ -59,8 +56,6 @@ pub(crate) struct AuthInfo {
    pub(crate) account_id: Option<AccountIdInt>,
    /// Are public connections or VPC connections blocked?
    pub(crate) access_blocker_flags: AccessBlockerFlags,
-    /// The rate limits for this endpoint.
-    pub(crate) rate_limits: EndpointRateLimitConfig,
 }

 /// Info for establishing a connection to a compute node.
@@ -106,8 +101,6 @@ pub struct EndpointAccessControl {
    pub allowed_ips: Arc<Vec<IpPattern>>,
    pub allowed_vpce: Arc<Vec<String>>,
    pub flags: AccessBlockerFlags,
-
-    pub rate_limits: EndpointRateLimitConfig,
 }

 impl EndpointAccessControl {
@@ -146,36 +139,6 @@ impl EndpointAccessControl {

        Ok(())
    }
-
-    pub fn connection_attempt_rate_limit(
-        &self,
-        ctx: &RequestContext,
-        endpoint: &EndpointId,
-        rate_limiter: &EndpointRateLimiter,
-    ) -> Result<(), AuthError> {
-        let endpoint = EndpointIdInt::from(endpoint);
-
-        let limits = &self.rate_limits.connection_attempts;
-        let config = match ctx.protocol() {
-            crate::metrics::Protocol::Http => limits.http,
-            crate::metrics::Protocol::Ws => limits.ws,
-            crate::metrics::Protocol::Tcp => limits.tcp,
-            crate::metrics::Protocol::SniRouter => return Ok(()),
-        };
-        let config = config.and_then(|config| {
-            if config.rps <= 0.0 || config.burst <= 0.0 {
-                return None;
-            }
-
-            Some(LeakyBucketConfig::new(config.rps, config.burst))
-        });
-
-        if !rate_limiter.check(endpoint, config, 1) {
-            return Err(AuthError::too_many_connections());
-        }
-
-        Ok(())
-    }
 }

 /// This will allocate per each call, but the http requests alone
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -75,7 +75,6 @@
 pub mod binary;

 mod auth;
-mod batch;
 mod cache;
 mod cancellation;
 mod compute;
@@ -107,5 +106,4 @@ mod tls;
 mod types;
 mod url;
 mod usage_metrics;
-mod util;
 mod waiters;
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -8,19 +8,19 @@ use crate::config::{ComputeConfig, RetryConfig};
 use crate::context::RequestContext;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::{self, NodeInfo};
+use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ReportableError;
 use crate::metrics::{
    ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType,
 };
 use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry};
-use crate::proxy::wake_compute::{WakeComputeBackend, wake_compute};
+use crate::proxy::wake_compute::wake_compute;
 use crate::types::Host;

 /// If we couldn't connect, a cached connection info might be to blame
 /// (e.g. the compute node's address might've changed at the wrong time).
 /// Invalidate the cache entry (if any) to prevent subsequent errors.
-#[tracing::instrument(skip_all)]
+#[tracing::instrument(name = "invalidate_cache", skip_all)]
 pub(crate) fn invalidate_cache(node_info: control_plane::CachedNodeInfo) -> NodeInfo {
    let is_cached = node_info.cached();
    if is_cached {
@@ -49,6 +49,14 @@ pub(crate) trait ConnectMechanism {
    ) -> Result<Self::Connection, Self::ConnectError>;
 }

+#[async_trait]
+pub(crate) trait ComputeConnectBackend {
+    async fn wake_compute(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
+}
+
 pub(crate) struct TcpMechanism {
    pub(crate) auth: AuthInfo,
    /// connect_to_compute concurrency lock
@@ -83,7 +91,7 @@ impl ConnectMechanism for TcpMechanism {

 /// Try to connect to the compute node, retrying if necessary.
 #[tracing::instrument(skip_all)]
-pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: WakeComputeBackend>(
+pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
    ctx: &RequestContext,
    mechanism: &M,
    user_info: &B,
--- a/proxy/src/pglb/mod.rs
+++ b/proxy/src/pglb/mod.rs
@@ -1,3 +1,4 @@
+pub mod connect_compute;
 pub mod copy_bidirectional;
 pub mod handshake;
 pub mod inprocess;
--- a/proxy/src/pglb/passthrough.rs
+++ b/proxy/src/pglb/passthrough.rs
@@ -1,17 +1,15 @@
-use std::convert::Infallible;
-
+use futures::FutureExt;
 use smol_str::SmolStr;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::debug;
 use utils::measured_stream::MeasuredStream;

 use super::copy_bidirectional::ErrorSource;
-use crate::compute::MaybeRustlsStream;
+use crate::cancellation;
+use crate::compute::PostgresConnection;
+use crate::config::ComputeConfig;
 use crate::control_plane::messages::MetricsAuxInfo;
-use crate::metrics::{
-    Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard,
-    NumDbConnectionsGuard,
-};
+use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard};
 use crate::stream::Stream;
 use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS};

@@ -66,20 +64,40 @@ pub(crate) async fn proxy_pass(

 pub(crate) struct ProxyPassthrough<S> {
    pub(crate) client: Stream<S>,
-    pub(crate) compute: MaybeRustlsStream,
-
+    pub(crate) compute: PostgresConnection,
    pub(crate) aux: MetricsAuxInfo,
+    pub(crate) session_id: uuid::Uuid,
    pub(crate) private_link_id: Option<SmolStr>,
-
-    pub(crate) _cancel_on_shutdown: tokio::sync::oneshot::Sender<Infallible>,
+    pub(crate) cancel: cancellation::Session,

    pub(crate) _req: NumConnectionRequestsGuard<'static>,
    pub(crate) _conn: NumClientConnectionsGuard<'static>,
-    pub(crate) _db_conn: NumDbConnectionsGuard<'static>,
 }

 impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
-    pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> {
-        proxy_pass(self.client, self.compute, self.aux, self.private_link_id).await
+    pub(crate) async fn proxy_pass(
+        self,
+        compute_config: &ComputeConfig,
+    ) -> Result<(), ErrorSource> {
+        let res = proxy_pass(
+            self.client,
+            self.compute.stream,
+            self.aux,
+            self.private_link_id,
+        )
+        .await;
+        if let Err(err) = self
+            .compute
+            .cancel_closure
+            .try_cancel_query(compute_config)
+            .boxed()
+            .await
+        {
+            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
+        }
+
+        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
+
+        res
    }
 }
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -1,10 +1,8 @@
 #[cfg(test)]
 mod tests;

-pub(crate) mod connect_compute;
 pub(crate) mod retry;
 pub(crate) mod wake_compute;
-
 use std::sync::Arc;

 use futures::FutureExt;
@@ -23,16 +21,15 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
 use crate::context::RequestContext;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
+use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute};
 pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
 use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake};
 use crate::pglb::passthrough::ProxyPassthrough;
 use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams};
 use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol};
-use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::stream::{PqStream, Stream};
 use crate::types::EndpointCacheKey;
-use crate::util::run_until_cancelled;
 use crate::{auth, compute};

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
@@ -49,6 +46,21 @@ impl ReportableError for TlsRequired {

 impl UserFacingError for TlsRequired {}

+pub async fn run_until_cancelled<F: std::future::Future>(
+    f: F,
+    cancellation_token: &CancellationToken,
+) -> Option<F::Output> {
+    match futures::future::select(
+        std::pin::pin!(f),
+        std::pin::pin!(cancellation_token.cancelled()),
+    )
+    .await
+    {
+        futures::future::Either::Left((f, _)) => Some(f),
+        futures::future::Either::Right(((), _)) => None,
+    }
+}
+
 pub async fn task_main(
    config: &'static ProxyConfig,
    auth_backend: &'static auth::Backend<'static, ()>,
@@ -155,7 +167,7 @@ pub async fn task_main(
                Ok(Some(p)) => {
                    ctx.set_success();
                    let _disconnect = ctx.log_connect();
-                    match p.proxy_pass().await {
+                    match p.proxy_pass(&config.connect_to_compute).await {
                        Ok(()) => {}
                        Err(ErrorSource::Client(e)) => {
                            warn!(
@@ -346,12 +358,12 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        }
    };

-    let (cplane, creds) = match user_info {
-        auth::Backend::ControlPlane(cplane, creds) => (cplane, creds),
+    let creds = match &user_info {
+        auth::Backend::ControlPlane(_, creds) => creds,
        auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"),
    };
    let params_compat = creds.info.options.get(NeonOptions::PARAMS_COMPAT).is_some();
-    let mut auth_info = compute::AuthInfo::with_auth_keys(creds.keys);
+    let mut auth_info = compute::AuthInfo::with_auth_keys(&creds.keys);
    auth_info.set_startup_params(&params, params_compat);

    let res = connect_to_compute(
@@ -361,7 +373,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
            auth: auth_info,
            locks: &config.connect_compute_locks,
        },
-        &auth::Backend::ControlPlane(cplane, creds.info),
+        &user_info,
        config.wake_compute_retry_config,
        &config.connect_to_compute,
    )
@@ -372,19 +384,13 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
    };

-    let session = cancellation_handler.get_key();
+    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+    let session = cancellation_handler_clone.get_key();

+    session.write_cancel_key(node.cancel_closure.clone())?;
    prepare_client_connection(&node, *session.key(), &mut stream);
    let stream = stream.flush_and_into_inner().await?;

-    let session_id = ctx.session_id();
-    let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel();
-    tokio::spawn(async move {
-        session
-            .maintain_cancel_key(session_id, cancel, &node.cancel_closure)
-            .await;
-    });
-
    let private_link_id = match ctx.extra() {
        Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()),
        Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()),
@@ -393,16 +399,13 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(

    Ok(Some(ProxyPassthrough {
        client: stream,
-        compute: node.stream,
-
-        aux: node.aux,
+        aux: node.aux.clone(),
        private_link_id,
-
-        _cancel_on_shutdown: cancel_on_shutdown,
-
+        compute: node,
+        session_id: ctx.session_id(),
+        cancel: session,
        _req: request_gauge,
        _conn: conn_gauge,
-        _db_conn: node.guage,
    }))
 }

--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -8,7 +8,7 @@ use std::time::Duration;
 use anyhow::{Context, bail};
 use async_trait::async_trait;
 use http::StatusCode;
-use postgres_client::config::SslMode;
+use postgres_client::config::{AuthKeys, ScramKeys, SslMode};
 use postgres_client::tls::{MakeTlsConnect, NoTls};
 use retry::{ShouldRetryWakeCompute, retry_after};
 use rstest::rstest;
@@ -19,13 +19,15 @@ use tracing_test::traced_test;

 use super::retry::CouldRetry;
 use super::*;
-use crate::auth::backend::{ComputeUserInfo, MaybeOwned};
+use crate::auth::backend::{
+    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned,
+};
 use crate::config::{ComputeConfig, RetryConfig};
 use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
 use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache};
 use crate::error::ErrorKind;
-use crate::proxy::connect_compute::ConnectMechanism;
+use crate::pglb::connect_compute::ConnectMechanism;
 use crate::tls::client_config::compute_client_config_with_certs;
 use crate::tls::server_config::CertResolver;
 use crate::types::{BranchId, EndpointId, ProjectId};
@@ -573,13 +575,19 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn

 fn helper_create_connect_info(
    mechanism: &TestConnectMechanism,
-) -> auth::Backend<'static, ComputeUserInfo> {
+) -> auth::Backend<'static, ComputeCredentials> {
    auth::Backend::ControlPlane(
        MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
-        ComputeUserInfo {
-            endpoint: "endpoint".into(),
-            user: "user".into(),
-            options: NeonOptions::parse_options_raw(""),
+        ComputeCredentials {
+            info: ComputeUserInfo {
+                endpoint: "endpoint".into(),
+                user: "user".into(),
+                options: NeonOptions::parse_options_raw(""),
+            },
+            keys: ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(ScramKeys {
+                client_key: [0; 32],
+                server_key: [0; 32],
+            })),
        },
    )
 }
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -1,4 +1,3 @@
-use async_trait::async_trait;
 use tracing::{error, info};

 use crate::config::RetryConfig;
@@ -9,6 +8,7 @@ use crate::error::ReportableError;
 use crate::metrics::{
    ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
 };
+use crate::pglb::connect_compute::ComputeConnectBackend;
 use crate::proxy::retry::{retry_after, should_retry};

 // Use macro to retain original callsite.
@@ -23,12 +23,7 @@ macro_rules! log_wake_compute_error {
    };
 }

-#[async_trait]
-pub(crate) trait WakeComputeBackend {
-    async fn wake_compute(&self, ctx: &RequestContext) -> Result<CachedNodeInfo, WakeComputeError>;
-}
-
-pub(crate) async fn wake_compute<B: WakeComputeBackend>(
+pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
    num_retries: &mut u32,
    ctx: &RequestContext,
    api: &B,
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -69,8 +69,9 @@ pub struct LeakyBucketConfig {
    pub max: f64,
 }

+#[cfg(test)]
 impl LeakyBucketConfig {
-    pub fn new(rps: f64, max: f64) -> Self {
+    pub(crate) fn new(rps: f64, max: f64) -> Self {
        assert!(rps > 0.0, "rps must be positive");
        assert!(max > 0.0, "max must be positive");
        Self { rps, max }
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -12,10 +12,11 @@ use rand::{Rng, SeedableRng};
 use tokio::time::{Duration, Instant};
 use tracing::info;

-use super::LeakyBucketConfig;
 use crate::ext::LockExt;
 use crate::intern::EndpointIdInt;

+use super::LeakyBucketConfig;
+
 pub struct GlobalRateLimiter {
    data: Vec<RateBucket>,
    info: Vec<RateBucketInfo>,
--- a/proxy/src/redis/keys.rs
+++ b/proxy/src/redis/keys.rs
@@ -1,4 +1,8 @@
-use crate::pqproto::CancelKeyData;
+use std::io::ErrorKind;
+
+use anyhow::Ok;
+
+use crate::pqproto::{CancelKeyData, id_to_cancel_key};

 pub mod keyspace {
    pub const CANCEL_PREFIX: &str = "cancel";
@@ -19,12 +23,39 @@ impl KeyPrefix {
            }
        }
    }
+
+    #[allow(dead_code)]
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX,
+        }
+    }
+}
+
+#[allow(dead_code)]
+pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result<KeyPrefix> {
+    let (prefix, key_str) = key.split_once(':').ok_or_else(|| {
+        anyhow::anyhow!(std::io::Error::new(
+            ErrorKind::InvalidData,
+            "missing prefix"
+        ))
+    })?;
+
+    match prefix {
+        keyspace::CANCEL_PREFIX => {
+            let id = u64::from_str_radix(key_str, 16)?;
+
+            Ok(KeyPrefix::Cancel(id_to_cancel_key(id)))
+        }
+        _ => Err(anyhow::anyhow!(std::io::Error::new(
+            ErrorKind::InvalidData,
+            "unknown prefix"
+        ))),
+    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::pqproto::id_to_cancel_key;
-
    use super::*;

    #[test]
@@ -34,4 +65,16 @@ mod tests {
        let redis_key = cancel_key.build_redis_key();
        assert_eq!(redis_key, "cancel:30390000d431");
    }
+
+    #[test]
+    fn test_parse_redis_key() {
+        let redis_key = "cancel:30390000d431";
+        let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key");
+
+        let ref_key = id_to_cancel_key(12345 << 32 | 54321);
+
+        assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str());
+        let KeyPrefix::Cancel(cancel_key) = key;
+        assert_eq!(ref_key, cancel_key);
+    }
 }
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -1,6 +1,3 @@
-use std::time::Duration;
-
-use futures::FutureExt;
 use redis::aio::ConnectionLike;
 use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};

@@ -38,11 +35,14 @@ impl RedisKVClient {
    }

    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
-        self.client
-            .connect()
-            .boxed()
-            .await
-            .inspect_err(|e| tracing::error!("failed to connect to redis: {e}"))
+        match self.client.connect().await {
+            Ok(()) => {}
+            Err(e) => {
+                tracing::error!("failed to connect to redis: {e}");
+                return Err(e);
+            }
+        }
+        Ok(())
    }

    pub(crate) async fn query<T: FromRedisValue>(
@@ -54,25 +54,15 @@ impl RedisKVClient {
            return Err(anyhow::anyhow!("Rate limit exceeded"));
        }

-        let e = match q.query(&mut self.client).await {
+        match q.query(&mut self.client).await {
            Ok(t) => return Ok(t),
-            Err(e) => e,
-        };
-
-        tracing::error!("failed to run query: {e}");
-        match e.retry_method() {
-            redis::RetryMethod::Reconnect => {
-                tracing::info!("Redis client is disconnected. Reconnecting...");
-                self.try_connect().await?;
+            Err(e) => {
+                tracing::error!("failed to run query: {e}");
            }
-            redis::RetryMethod::RetryImmediately => {}
-            redis::RetryMethod::WaitAndRetry => {
-                // somewhat arbitrary.
-                tokio::time::sleep(Duration::from_millis(100)).await;
-            }
-            _ => Err(e)?,
        }

+        tracing::info!("Redis client is disconnected. Reconnecting...");
+        self.try_connect().await?;
        Ok(q.query(&mut self.client).await?)
    }
 }
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -21,7 +21,7 @@ use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool};
 use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client};
 use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool};
 use crate::auth::backend::local::StaticAuthRules;
-use crate::auth::backend::{ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo};
+use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::{self, AuthError};
 use crate::compute_ctl::{
    ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
@@ -34,7 +34,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
 use crate::control_plane::locks::ApiLocks;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::intern::EndpointIdInt;
-use crate::proxy::connect_compute::ConnectMechanism;
+use crate::pglb::connect_compute::ConnectMechanism;
 use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX};
@@ -68,20 +68,17 @@ impl PoolingBackend {
            self.config.authentication_config.is_vpc_acccess_proxy,
        )?;

-        access_control.connection_attempt_rate_limit(
-            ctx,
-            &user_info.endpoint,
-            &self.endpoint_rate_limiter,
-        )?;
-
+        let ep = EndpointIdInt::from(&user_info.endpoint);
+        let rate_limit_config = None;
+        if !self.endpoint_rate_limiter.check(ep, rate_limit_config, 1) {
+            return Err(AuthError::too_many_connections());
+        }
        let role_access = backend.get_role_secret(ctx).await?;
        let Some(secret) = role_access.secret else {
            // If we don't have an authentication secret, for the http flow we can just return an error.
            info!("authentication info not found");
            return Err(AuthError::password_failed(&*user_info.user));
        };
-
-        let ep = EndpointIdInt::from(&user_info.endpoint);
        let auth_outcome = crate::auth::validate_password_and_exchange(
            &self.config.authentication_config.thread_pool,
            ep,
@@ -183,15 +180,14 @@ impl PoolingBackend {
        let conn_id = uuid::Uuid::new_v4();
        tracing::Span::current().record("conn_id", display(conn_id));
        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        let backend = self.auth_backend.as_ref().map(|()| keys.info);
-        crate::proxy::connect_compute::connect_to_compute(
+        let backend = self.auth_backend.as_ref().map(|()| keys);
+        crate::pglb::connect_compute::connect_to_compute(
            ctx,
            &TokioMechanism {
                conn_id,
                conn_info,
                pool: self.pool.clone(),
                locks: &self.config.connect_compute_locks,
-                keys: keys.keys,
            },
            &backend,
            self.config.wake_compute_retry_config,
@@ -218,15 +214,18 @@ impl PoolingBackend {
        let conn_id = uuid::Uuid::new_v4();
        tracing::Span::current().record("conn_id", display(conn_id));
        debug!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        let backend = self.auth_backend.as_ref().map(|()| ComputeUserInfo {
-            user: conn_info.user_info.user.clone(),
-            endpoint: EndpointId::from(format!(
-                "{}{LOCAL_PROXY_SUFFIX}",
-                conn_info.user_info.endpoint.normalize()
-            )),
-            options: conn_info.user_info.options.clone(),
+        let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials {
+            info: ComputeUserInfo {
+                user: conn_info.user_info.user.clone(),
+                endpoint: EndpointId::from(format!(
+                    "{}{LOCAL_PROXY_SUFFIX}",
+                    conn_info.user_info.endpoint.normalize()
+                )),
+                options: conn_info.user_info.options.clone(),
+            },
+            keys: crate::auth::backend::ComputeCredentialKeys::None,
        });
-        crate::proxy::connect_compute::connect_to_compute(
+        crate::pglb::connect_compute::connect_to_compute(
            ctx,
            &HyperMechanism {
                conn_id,
@@ -496,7 +495,6 @@ struct TokioMechanism {
    pool: Arc<GlobalConnPool<postgres_client::Client, EndpointConnPool<postgres_client::Client>>>,
    conn_info: ConnInfo,
    conn_id: uuid::Uuid,
-    keys: ComputeCredentialKeys,

    /// connect_to_compute concurrency lock
    locks: &'static ApiLocks<Host>,
@@ -522,10 +520,6 @@ impl ConnectMechanism for TokioMechanism {
            .dbname(&self.conn_info.dbname)
            .connect_timeout(compute_config.timeout);

-        if let ComputeCredentialKeys::AuthKeys(auth_keys) = self.keys {
-            config.auth_keys(auth_keys);
-        }
-
        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
        let res = config.connect(compute_config).await;
        drop(pause);
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -50,10 +50,10 @@ use crate::context::RequestContext;
 use crate::ext::TaskExt;
 use crate::metrics::Metrics;
 use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
+use crate::proxy::run_until_cancelled;
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
-use crate::util::run_until_cancelled;

 pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api";
 pub(crate) const AUTH_BROKER_SNI: &str = "apiauth";
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -41,11 +41,10 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::http::{ReadBodyError, read_body_with_limit};
 use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind};
 use crate::pqproto::StartupMessageParams;
-use crate::proxy::NeonOptions;
+use crate::proxy::{NeonOptions, run_until_cancelled};
 use crate::serverless::backend::HttpConnError;
 use crate::types::{DbName, RoleName};
 use crate::usage_metrics::{MetricCounter, MetricCounterRecorder};
-use crate::util::run_until_cancelled;

 #[derive(serde::Deserialize)]
 #[serde(rename_all = "camelCase")]
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -167,7 +167,7 @@ pub(crate) async fn serve_websocket(
        Ok(Some(p)) => {
            ctx.set_success();
            ctx.log_connect();
-            match p.proxy_pass().await {
+            match p.proxy_pass(&config.connect_to_compute).await {
                Ok(()) => Ok(()),
                Err(ErrorSource::Client(err)) => Err(err).context("client"),
                Err(ErrorSource::Compute(err)) => Err(err).context("compute"),
--- a/proxy/src/util.rs
+++ b/proxy/src/util.rs
@@ -1,14 +0,0 @@
-use std::pin::pin;
-
-use futures::future::{Either, select};
-use tokio_util::sync::CancellationToken;
-
-pub async fn run_until_cancelled<F: Future>(
-    f: F,
-    cancellation_token: &CancellationToken,
-) -> Option<F::Output> {
-    match select(pin!(f), pin!(cancellation_token.cancelled())).await {
-        Either::Left((f, _)) => Some(f),
-        Either::Right(((), _)) => None,
-    }
-}
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1036,8 +1036,9 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro

    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    let drain_all: bool = parse_query_param(&req, "drain_all")?.unwrap_or(false);

-    state.service.start_node_drain(node_id).await?;
+    state.service.start_node_drain(node_id, drain_all).await?;

    json_response(StatusCode::ACCEPTED, ())
 }
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -7611,6 +7611,7 @@ impl Service {
    pub(crate) async fn start_node_drain(
        self: &Arc<Self>,
        node_id: NodeId,
+        drain_all: bool,
    ) -> Result<(), ApiError> {
        let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = {
            let locked = self.inner.read().unwrap();
@@ -7684,7 +7685,7 @@ impl Service {
                        }

                        tracing::info!("Drain background operation starting");
-                        let res = service.drain_node(node_id, cancel).await;
+                        let res = service.drain_node(node_id, drain_all, cancel).await;
                        match res {
                            Ok(()) => {
                                tracing::info!("Drain background operation completed successfully");
@@ -8850,9 +8851,30 @@ impl Service {
        }
    }

-    /// Drain a node by moving the shards attached to it as primaries.
-    /// This is a long running operation and it should run as a separate Tokio task.
+    /// Drain a node by moving shards that are attached to it, either as primaries or secondaries.
+    /// When `drain_all` is false, only primary attachments are moved - this is used during node
+    /// deployment when the node is expected to return to service soon. When `drain_all` is true,
+    /// both primary and secondary attachments are moved - this is used when permanently removing
+    /// a node.
+    ///
+    /// This is a long running operation that should be spawned as a separate Tokio task.
    pub(crate) async fn drain_node(
+        self: &Arc<Self>,
+        node_id: NodeId,
+        drain_all: bool,
+        cancel: CancellationToken,
+    ) -> Result<(), OperationError> {
+        self.drain_primary_attachments(node_id, cancel.clone())
+            .await?;
+        if drain_all {
+            self.drain_secondary_attachments(node_id, cancel).await?;
+        }
+        Ok(())
+    }
+
+    /// Drain a node by moving the shards attached to it as primaries.
+    /// This is a long running operation
+    async fn drain_primary_attachments(
        self: &Arc<Self>,
        node_id: NodeId,
        cancel: CancellationToken,
@@ -8868,10 +8890,11 @@ impl Service {
        // to not stall the operation when a cold secondary is encountered.
        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30);
        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
-        let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal)
-            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
-            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
-            .build();
+        let reconciler_config: ReconcilerConfig =
+            ReconcilerConfigBuilder::new(ReconcilerPriority::Normal)
+                .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
+                .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
+                .build();

        let mut waiters = Vec::new();

@@ -9048,6 +9071,14 @@ impl Service {
        Ok(())
    }

+    async fn drain_secondary_attachments(
+        self: &Arc<Self>,
+        _node_id: NodeId,
+        _cancel: CancellationToken,
+    ) -> Result<(), OperationError> {
+        Ok(())
+    }
+
    /// Create a node fill plan (pick secondaries to promote), based on:
    /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
    ///    outside their home AZ, should be migrated back here.
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -69,17 +69,15 @@ class EndpointHttpClient(requests.Session):
        json: dict[str, str] = res.json()
        return json

-    def prewarm_lfc(self, from_endpoint_id: str | None = None):
-        url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
-        params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
-        self.post(url, params=params).raise_for_status()
+    def prewarm_lfc(self):
+        self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status()

        def prewarmed():
            json = self.prewarm_lfc_status()
            status, err = json["status"], json.get("error")
            assert status == "completed", f"{status}, error {err}"

-        wait_until(prewarmed, timeout=60)
+        wait_until(prewarmed)

    def offload_lfc(self):
        url = f"http://localhost:{self.external_port}/lfc/offload"
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -129,18 +129,6 @@ class NeonAPI:

        return cast("dict[str, Any]", resp.json())

-    def get_project_limits(self, project_id: str) -> dict[str, Any]:
-        resp = self.__request(
-            "GET",
-            f"/projects/{project_id}/limits",
-            headers={
-                "Accept": "application/json",
-                "Content-Type": "application/json",
-            },
-        )
-
-        return cast("dict[str, Any]", resp.json())
-
    def delete_project(
        self,
        project_id: str,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2062,11 +2062,16 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

-    def node_drain(self, node_id):
-        log.info(f"node_drain({node_id})")
+    def node_drain(self, node_id: int, drain_all: bool | None = None):
+        log.info(f"node_drain({node_id}, drain_all={drain_all})")
+
+        url = f"{self.api}/control/v1/node/{node_id}/drain"
+        if drain_all is not None:
+            url += f"?drain_all={str(drain_all).lower()}"
+
        self.request(
            "PUT",
-            f"{self.api}/control/v1/node/{node_id}/drain",
+            url,
            headers=self.headers(TokenScope.INFRA),
        )

@@ -4046,16 +4051,6 @@ def static_proxy(
        "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))"
    )

-    vanilla_pg.stop()
-    vanilla_pg.edit_hba(
-        [
-            "local all all              trust",
-            "host  all all 127.0.0.1/32 scram-sha-256",
-            "host  all all ::1/128      scram-sha-256",
-        ]
-    )
-    vanilla_pg.start()
-
    proxy_port = port_distributor.get_port()
    mgmt_port = port_distributor.get_port()
    http_port = port_distributor.get_port()
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -45,8 +45,6 @@ class NeonEndpoint:
        if self.branch.connect_env:
            self.connect_env = self.branch.connect_env.copy()
            self.connect_env["PGHOST"] = self.host
-        if self.type == "read_only":
-            self.project.read_only_endpoints_total += 1

    def delete(self):
        self.project.delete_endpoint(self.id)
@@ -230,13 +228,8 @@ class NeonProject:
        self.benchmarks: dict[str, subprocess.Popen[Any]] = {}
        self.restore_num: int = 0
        self.restart_pgbench_on_console_errors: bool = False
-        self.limits: dict[str, Any] = self.get_limits()["limits"]
-        self.read_only_endpoints_total: int = 0

-    def get_limits(self) -> dict[str, Any]:
-        return self.neon_api.get_project_limits(self.id)
-
-    def delete(self) -> None:
+    def delete(self):
        self.neon_api.delete_project(self.id)

    def create_branch(self, parent_id: str | None = None) -> NeonBranch | None:
@@ -289,7 +282,6 @@ class NeonProject:
        self.neon_api.delete_endpoint(self.id, endpoint_id)
        self.endpoints[endpoint_id].branch.endpoints.pop(endpoint_id)
        self.endpoints.pop(endpoint_id)
-        self.read_only_endpoints_total -= 1
        self.wait()

    def start_benchmark(self, target: str, clients: int = 10) -> subprocess.Popen[Any]:
@@ -377,64 +369,49 @@ def setup_class(
        print(f"::warning::Retried on 524 error {neon_api.retries524} times")
    if neon_api.retries4xx > 0:
        print(f"::warning::Retried on 4xx error {neon_api.retries4xx} times")
-    log.info("Removing the project %s", project.id)
+    log.info("Removing the project")
    project.delete()


-def do_action(project: NeonProject, action: str) -> bool:
+def do_action(project: NeonProject, action: str) -> None:
    """
    Runs the action
    """
    log.info("Action: %s", action)
    if action == "new_branch":
        log.info("Trying to create a new branch")
-        if 0 <= project.limits["max_branches"] <= len(project.branches):
-            log.info(
-                "Maximum branch limit exceeded (%s of %s)",
-                len(project.branches),
-                project.limits["max_branches"],
-            )
-            return False
        parent = project.branches[
            random.choice(list(set(project.branches.keys()) - project.reset_branches))
        ]
        log.info("Parent: %s", parent)
        child = parent.create_child_branch()
        if child is None:
-            return False
+            return
        log.info("Created branch %s", child)
        child.start_benchmark()
    elif action == "delete_branch":
        if project.leaf_branches:
-            target: NeonBranch = random.choice(list(project.leaf_branches.values()))
+            target = random.choice(list(project.leaf_branches.values()))
            log.info("Trying to delete branch %s", target)
            target.delete()
        else:
            log.info("Leaf branches not found, skipping")
-            return False
    elif action == "new_ro_endpoint":
-        if 0 <= project.limits["max_read_only_endpoints"] <= project.read_only_endpoints_total:
-            log.info(
-                "Maximum read only endpoint limit exceeded (%s of %s)",
-                project.read_only_endpoints_total,
-                project.limits["max_read_only_endpoints"],
-            )
-            return False
        ep = random.choice(
            [br for br in project.branches.values() if br.id not in project.reset_branches]
        ).create_ro_endpoint()
        log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id)
        ep.start_benchmark()
    elif action == "delete_ro_endpoint":
-        if project.read_only_endpoints_total == 0:
-            log.info("no read_only endpoints present, skipping")
-            return False
        ro_endpoints: list[NeonEndpoint] = [
            endpoint for endpoint in project.endpoints.values() if endpoint.type == "read_only"
        ]
-        target_ep: NeonEndpoint = random.choice(ro_endpoints)
-        target_ep.delete()
-        log.info("endpoint %s deleted", target_ep.id)
+        if ro_endpoints:
+            target_ep: NeonEndpoint = random.choice(ro_endpoints)
+            target_ep.delete()
+            log.info("endpoint %s deleted", target_ep.id)
+        else:
+            log.info("no read_only endpoints present, skipping")
    elif action == "restore_random_time":
        if project.leaf_branches:
            br: NeonBranch = random.choice(list(project.leaf_branches.values()))
@@ -442,10 +419,8 @@ def do_action(project: NeonProject, action: str) -> bool:
            br.restore_random_time()
        else:
            log.info("No leaf branches found")
-            return False
    else:
        raise ValueError(f"The action {action} is unknown")
-    return True


@pytest.mark.timeout(7200)
@@ -482,9 +457,8 @@ def test_api_random(
    pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env)
    for _ in range(num_operations):
        log.info("Starting action #%s", _ + 1)
-        while not do_action(
+        do_action(
            project, random.choices([a[0] for a in ACTIONS], weights=[w[1] for w in ACTIONS])[0]
-        ):
-            log.info("Retrying...")
+        )
        project.check_all_benchmarks()
    assert True
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -188,8 +188,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
            pg_cur.execute("select pg_reload_conf()")

            if query is LfcQueryMethod.COMPUTE_CTL:
-                # Same thing as prewarm_lfc(), testing other method
-                http_client.prewarm_lfc(endpoint.endpoint_id)
+                http_client.prewarm_lfc()
            else:
                pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))

--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -16,7 +16,7 @@ if TYPE_CHECKING:

 # Test restarting page server, while safekeeper and compute node keep
 # running.
-def test_pageserver_restarts_under_workload(neon_simple_env: NeonEnv, pg_bin: PgBin):
+def test_pageserver_restarts_under_workload(neon_simple_env: NeonEnv, pg_bin: PgBin):
    env = neon_simple_env
    env.create_branch("test_pageserver_restarts")
    endpoint = env.endpoints.create_start("test_pageserver_restarts")
@@ -28,11 +28,7 @@ def test_pageserver_restarts_under_workload(neon_simple_env: NeonEnv, pg_bin: Pg
        pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
        pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])

-    thread = threading.Thread(
-        target=run_pgbench,
-        args=(endpoint.connstr(options="-cstatement_timeout=360s"),),
-        daemon=True,
-    )
+    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
    thread.start()

    for _ in range(n_restarts):
--- a/test_runner/regress/test_proxy_allowed_ips.py
+++ b/test_runner/regress/test_proxy_allowed_ips.py
@@ -19,15 +19,11 @@ TABLE_NAME = "neon_control_plane.endpoints"
 async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres):
    # Shouldn't be able to connect to this project
    vanilla_pg.safe_psql(
-        f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')",
-        user="proxy",
-        password="password",
+        f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')"
    )
    # Should be able to connect to this project
    vanilla_pg.safe_psql(
-        f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')",
-        user="proxy",
-        password="password",
+        f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')"
    )

    def check_cannot_connect(**kwargs):
@@ -64,9 +60,7 @@ async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil

    # Shouldn't be able to connect to this project
    vanilla_pg.safe_psql(
-        f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')",
-        user="proxy",
-        password="password",
+        f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')"
    )

    def query(status: int, query: str, *args):
@@ -81,8 +75,6 @@ async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil
    query(400, "select 1;")  # ip address is not allowed
    # Should be able to connect to this project
    vanilla_pg.safe_psql(
-        f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'",
-        user="proxy",
-        password="password",
+        f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'"
    )
    query(200, "select 1;")  # should work now
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -3093,6 +3093,70 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
    wait_until(reconfigure_node_again)


+def test_drain_with_secondary_locations(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 4
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    def get_pageserver_tenant_shards(node_id):
+        ps = env.get_pageserver(node_id)
+        locations = ps.http_client().tenant_list_locations()["tenant_shards"]
+        ret = []
+        for loc in locations:
+            ret.append(
+                {
+                    "tenant_shard_id": TenantShardId.parse(loc[0]),
+                    "mode": loc[1]["mode"],
+                }
+            )
+        return ret
+
+    def log_pageservers_state():
+        for ps in env.pageservers:
+            for tenant_shard in get_pageserver_tenant_shards(ps.id):
+                tenant_shard_id = tenant_shard["tenant_shard_id"]
+                mode = tenant_shard["mode"]
+                log.info(f"[PS {ps.id}] Seen {tenant_shard_id} in mode {mode}")
+
+    tenants = {}  # id → shard_count
+    for shard_count in [1, 2, 4, 8]:
+        id, _ = env.create_tenant(shard_count=shard_count, placement_policy='{"Attached": 1}')
+        tenants[id] = shard_count
+
+    log.info("Pageservers before reconciliation:")
+    log_pageservers_state()
+
+    env.storage_controller.reconcile_until_idle()
+
+    log.info("Pageservers before drain:")
+    log_pageservers_state()
+
+    node_id = env.pageservers[0].id
+
+    env.storage_controller.warm_up_all_secondaries()
+    env.storage_controller.retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id, drain_all=True),
+        node_id,
+        max_attempts=3,
+        backoff=2,
+    )
+
+    env.storage_controller.poll_node_status(
+        node_id,
+        PageserverAvailability.ACTIVE,
+        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
+        max_attempts=6,
+        backoff=5,
+    )
+
+    log.info("Pageservers after drain:")
+    log_pageservers_state()
+
+    shards = get_pageserver_tenant_shards(node_id)
+    assert shards == []
+
+
 def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_pageservers = 3
Author	SHA1	Message	Date
Aleksandr Sarantsev	53bdbdf71f	another things	2025-06-11 09:19:07 +04:00
Aleksandr Sarantsev	652c7203b5	Merge branch 'main' into ephemeralsad/graceful-draining	2025-06-09 09:54:13 +04:00
Aleksandr Sarantsev	1d3fd5bfc7	Better storcon API	2025-06-06 10:56:58 +04:00
Aleksandr Sarantsev	cc53ed4e43	Merge branch 'main' into ephemeralsad/graceful-draining	2025-06-05 18:25:29 +04:00
Aleksandr Sarantsev	61a3258e5d	Add graceful flag for storcon	2025-06-02 17:07:30 +04:00
Aleksandr Sarantsev	24e627e44c	Graceful draining	2025-05-30 17:54:26 +04:00