From cb9ab7463c27f30532f944ff9d3adb0636e42364 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 14 Oct 2024 12:25:55 +0200 Subject: [PATCH 01/57] proxy: split out the console-redirect backend flow (#9270) removes the ConsoleRedirect backend from the main auth::Backends enum, copy-paste the existing crate::proxy::task_main structure to use the ConsoleRedirectBackend exclusively. This makes the logic a bit simpler at the cost of some fairly trivial code duplication. --- proxy/src/auth/backend/console_redirect.rs | 37 +++- proxy/src/auth/backend/mod.rs | 72 ++----- proxy/src/bin/local_proxy.rs | 2 +- proxy/src/bin/proxy.rs | 99 ++++++---- proxy/src/console_redirect_proxy.rs | 217 +++++++++++++++++++++ proxy/src/lib.rs | 1 + proxy/src/proxy/mod.rs | 6 +- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 7 +- proxy/src/serverless/mod.rs | 2 +- proxy/src/serverless/websocket.rs | 2 +- 11 files changed, 334 insertions(+), 113 deletions(-) create mode 100644 proxy/src/console_redirect_proxy.rs diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 127be545e1..457410ec8c 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,18 +1,24 @@ use crate::{ - auth, compute, + auth, + cache::Cached, + compute, config::AuthenticationConfig, context::RequestMonitoring, - control_plane::{self, provider::NodeInfo}, + control_plane::{self, provider::NodeInfo, CachedNodeInfo}, error::{ReportableError, UserFacingError}, + proxy::connect_compute::ComputeConnectBackend, stream::PqStream, waiters, }; +use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::SslMode; use tracing::{info, info_span}; +use super::ComputeCredentialKeys; + #[derive(Debug, Error)] pub(crate) enum WebAuthError { #[error(transparent)] @@ -25,6 +31,7 @@ pub(crate) enum WebAuthError { Io(#[from] 
std::io::Error), } +#[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, } @@ -66,17 +73,31 @@ impl ConsoleRedirectBackend { Self { console_uri } } - pub(super) fn url(&self) -> &reqwest::Url { - &self.console_uri - } - pub(crate) async fn authenticate( &self, ctx: &RequestMonitoring, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { - authenticate(ctx, auth_config, &self.console_uri, client).await + ) -> auth::Result { + authenticate(ctx, auth_config, &self.console_uri, client) + .await + .map(ConsoleRedirectNodeInfo) + } +} + +pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); + +#[async_trait] +impl ComputeConnectBackend for ConsoleRedirectNodeInfo { + async fn wake_compute( + &self, + _ctx: &RequestMonitoring, + ) -> Result { + Ok(Cached::new_uncached(self.0.clone())) + } + + fn get_keys(&self) -> &ComputeCredentialKeys { + &ComputeCredentialKeys::None } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 27c9f1876e..96e1a787ed 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -22,7 +22,7 @@ use crate::cache::Cached; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::{AuthSecret, NodeInfo}; +use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; @@ -66,11 +66,9 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum Backend<'a, T, D> { +pub enum Backend<'a, T> { /// Cloud API (V2). ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), - /// Authentication via a web browser. 
- ConsoleRedirect(MaybeOwned<'a, ConsoleRedirectBackend>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -91,7 +89,7 @@ impl Clone for Box { } } -impl std::fmt::Display for Backend<'_, (), ()> { +impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { @@ -107,46 +105,39 @@ impl std::fmt::Display for Backend<'_, (), ()> { #[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, - Self::ConsoleRedirect(backend, ()) => fmt - .debug_tuple("ConsoleRedirect") - .field(&backend.url().as_str()) - .finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl Backend<'_, T, D> { +impl Backend<'_, T> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(MaybeOwned::Borrowed(c), x), Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> Backend<'a, T, D> { +impl<'a, T> Backend<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. - pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(c, x), Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> Backend<'a, Result, D> { +impl<'a, T, E> Backend<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. 
/// This is most useful for error handling. - pub(crate) fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)), - Self::ConsoleRedirect(c, x) => Ok(Backend::ConsoleRedirect(c, x)), Self::Local(l) => Ok(Backend::Local(l)), } } @@ -414,12 +405,11 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { /// Get username from the credentials. pub(crate) fn get_user(&self) -> &str { match self { Self::ControlPlane(_, user_info) => &user_info.user, - Self::ConsoleRedirect(_, ()) => "web", Self::Local(_) => "local", } } @@ -433,7 +423,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { info!( @@ -454,14 +444,6 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { .await?; Backend::ControlPlane(api, credentials) } - // NOTE: this auth backend doesn't use client credentials. 
- Self::ConsoleRedirect(backend, ()) => { - info!("performing web authentication"); - - let info = backend.authenticate(ctx, config, client).await?; - - Backend::ConsoleRedirect(backend, info) - } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } @@ -472,14 +454,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl Backend<'_, ComputeUserInfo, &()> { +impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::ConsoleRedirect(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -492,21 +473,19 @@ impl Backend<'_, ComputeUserInfo, &()> { Self::ControlPlane(api, user_info) => { api.get_allowed_ips_and_secret(ctx, user_info).await } - Self::ConsoleRedirect(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, info) => Ok(Cached::new_uncached(info.clone())), Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -514,31 +493,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, _) => &ComputeCredentialKeys::None, - Self::Local(_) => &ComputeCredentialKeys::None, - } - } -} - -#[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { - async fn 
wake_compute( - &self, - ctx: &RequestMonitoring, - ) -> Result { - match self { - Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, ()) => { - unreachable!("web auth flow doesn't support waking the compute") - } - Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), - } - } - - fn get_keys(&self) -> &ComputeCredentialKeys { - match self { - Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c781af846a..c92ebbc51f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -291,7 +291,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { +) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( LocalBackend::new(args.compute), )); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3f4c2df809..3c0e66dec3 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -314,7 +314,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; let auth_backend = build_auth_backend(&args)?; - info!("Authentication backend: {}", auth_backend); + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; info!("Using region: {}", args.aws_region); let region_provider = @@ -461,26 +464,41 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } } client_tasks.spawn(proxy::context::parquet::worker( @@ -510,7 +528,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::Backend::ControlPlane(api, _) = auth_backend { + if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -663,7 +681,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { 
webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; - let config = Box::leak(Box::new(ProxyConfig { + let config = ProxyConfig { tls_config, metric_collection, allow_self_signed_compute: args.allow_self_signed_compute, @@ -677,7 +695,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, - })); + }; + + let config = Box::leak(Box::new(config)); tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); @@ -687,8 +707,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &ProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { - let auth_backend = match &args.auth_backend { +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -738,12 +758,11 @@ fn build_auth_backend( wake_compute_endpoint_rate_limiter, ); let api = control_plane::provider::ControlPlaneBackend::Management(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - AuthBackendType::Web => { - let url = args.uri.parse()?; - auth::Backend::ConsoleRedirect(MaybeOwned::Owned(ConsoleRedirectBackend::new(url)), ()) + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) } #[cfg(feature = "testing")] @@ -751,11 +770,23 @@ fn build_auth_backend( let url = args.auth_endpoint.parse()?; let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } - 
}; - Ok(Box::leak(Box::new(auth_backend))) + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::Web => { + let url = args.uri.parse()?; + let backend = ConsoleRedirectBackend::new(url); + + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } } #[cfg(test)] diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs new file mode 100644 index 0000000000..9e17976720 --- /dev/null +++ b/proxy/src/console_redirect_proxy.rs @@ -0,0 +1,217 @@ +use crate::auth::backend::ConsoleRedirectBackend; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::proxy::{ + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, +}; +use crate::{ + cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, + context::RequestMonitoring, + error::ReportableError, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, + proxy::handshake::{handshake, HandshakeData}, +}; +use futures::TryFutureExt; +use std::sync::Arc; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; + +use crate::proxy::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; + +pub async fn task_main( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, + cancellation_handler: Arc, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("proxy has shut down"); + } + + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. 
+ socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + + let session_id = uuid::Uuid::new_v4(); + let cancellation_handler = Arc::clone(&cancellation_handler); + + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); + return; + } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), + Ok((socket, None)) => (socket, peer_addr.ip()), + }; + + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; + + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span(); + + let startup = Box::pin( + handle_client( + config, + backend, + &ctx, + cancellation_handler, + socket, + conn_gauge, + ) + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match 
p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + } + } + } + } + }); + } + + connections.close(); + drop(listener); + + // Drain connections + connections.wait().await; + + Ok(()) +} + +pub(crate) async fn handle_client( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + ctx: &RequestMonitoring, + cancellation_handler: Arc, + stream: S, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id()) + .await + .map(|()| None)?) 
+ } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let user_info = match backend + .authenticate(ctx, &config.authentication_config, &mut stream) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + return stream.throw_error(e).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + config.allow_self_signed_compute, + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, + })) +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d274baa10..74bc778a36 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -95,6 +95,7 @@ pub mod cache; pub mod cancellation; pub mod compute; pub mod config; +pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 3a43ccb74a..b2b5a7f43d 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -61,7 +61,7 @@ pub async fn run_until_cancelled( pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, 
@@ -248,7 +248,7 @@ impl ReportableError for ClientRequestError { #[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, @@ -356,7 +356,7 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection

( +pub(crate) async fn prepare_client_connection

( node: &compute::PostgresConnection, session: &cancellation::Session

, stream: &mut PqStream, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3861ddc8ed..58fb36dba7 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -552,7 +552,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::Backend<'static, ComputeCredentials, &()> { +) -> auth::Backend<'static, ComputeCredentials> { let user_info = auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9e49478cf3..2b060af9e1 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -42,7 +42,7 @@ pub(crate) struct PoolingBackend { pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, - pub(crate) auth_backend: &'static crate::auth::Backend<'static, (), ()>, + pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, } @@ -135,9 +135,6 @@ impl PoolingBackend { keys: crate::auth::backend::ComputeCredentialKeys::None, }) } - crate::auth::Backend::ConsoleRedirect(_, ()) => Err(AuthError::auth_failed( - "JWT login over web auth proxy is not supported", - )), crate::auth::Backend::Local(_) => { let keys = self .config @@ -264,7 +261,7 @@ impl PoolingBackend { info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) | auth::Backend::ConsoleRedirect(_, ()) => { + auth::Backend::ControlPlane(_, ()) => { unreachable!("only local_proxy can connect to local postgres") } auth::Backend::Local(local) => local.node_info.clone(), diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 95f64e972c..3131adada4 100644 --- a/proxy/src/serverless/mod.rs +++ 
b/proxy/src/serverless/mod.rs @@ -55,7 +55,7 @@ pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index fd0f0cac7f..f5a692cf40 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, From d056ae9be5844b22378f961dd3ae730d96ef996e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Oct 2024 13:45:20 +0300 Subject: [PATCH 02/57] Ignore pg_dynshmem fiel when comparing directories (#9374) ## Problem At MacOS `pg_dynshmem` file is create in PGDATADIR which cause mismatch in directories comparison ## Summary of changes Add this files to the ignore list. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- test_runner/fixtures/neon_fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7789855fe4..059707c8ed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4280,6 +4280,7 @@ SKIP_FILES = frozenset( "postmaster.opts", "postmaster.pid", "pg_control", + "pg_dynshmem", ) ) From 31b7703fa87fa7fdc4d3a9f8b8f223cfddc0cd1a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 14 Oct 2024 11:51:01 +0100 Subject: [PATCH 03/57] CI(build-build-tools): fix unexpected cancellations (#9357) ## Problem When `Dockerfile.build-tools` gets changed, several PRs catch up with it and some might get unexpectedly cancelled workflows because of GitHub's concurrency model for workflows. See the comment in the code for more details. 
It should be possible to revert it after https://github.com/orgs/community/discussions/41518 (I don't expect it anytime soon, but I subscribed) ## Summary of changes - Do not queue `build-build-tools-image` workflows in the concurrency group --- .github/workflows/build-build-tools-image.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index ca5ff573e1..130753833d 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,9 +19,16 @@ defaults: run: shell: bash -euo pipefail {0} -concurrency: - group: build-build-tools-image-${{ inputs.image-tag }} - cancel-in-progress: false +# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image +# for the same tag in parallel workflow runs, and queue them to be skipped once we have +# the first image pushed to Docker registry, but GitHub's concurrency mechanism is not working as expected. +# GitHub can't have more than 1 job in a queue and removes the previous one, it causes failures if the dependent jobs. +# +# Ref https://github.com/orgs/community/discussions/41518 +# +# concurrency: +# group: build-build-tools-image-${{ inputs.image-tag }} +# cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} From d92ff578c4a738a52bdcb0a6f44af7691a64882c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 14:34:57 +0200 Subject: [PATCH 04/57] Add test for fixed storage broker issue (#9311) Adds a test for the (now fixed) storage broker limit issue, see #9268 for the description and #9299 for the fix. Also fix a race condition with endpoint creation/starts running in parallel, leading to file not found errors. 
--- control_plane/src/endpoint.rs | 16 +++++++++++++- test_runner/regress/test_tenants.py | 34 ++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7cdf621737..71514daa7c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -97,7 +97,21 @@ impl ComputeControlPlane { for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { - let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?; + let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env); + let ep = match ep_res { + Ok(ep) => ep, + Err(e) => match e.downcast::() { + Ok(e) => { + // A parallel task could delete an endpoint while we have just scanned the directory + if e.kind() == std::io::ErrorKind::NotFound { + continue; + } else { + Err(e)? + } + } + Err(e) => Err(e)?, + }, + }; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 95dc0fec78..4a16535941 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import os +import threading import time from contextlib import closing from datetime import datetime @@ -10,7 +11,7 @@ from pathlib import Path import pytest import requests -from fixtures.common_types import Lsn, TenantId +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -476,3 +477,34 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): assert counts log.info(f"directory counts: {counts}") assert counts[2] > COUNT_AT_LEAST_EXPECTED + + +def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): + """ + (Relaxed) regression test for issue that 
led to https://github.com/neondatabase/neon/pull/9268 + Create many endpoints in parallel and then restart them + """ + env = neon_simple_env + + # This param needs to be 200+ to reproduce the limit issue + n_threads = 16 + barrier = threading.Barrier(n_threads) + + def test_timeline(branch_name: str, timeline_id: TimelineId): + endpoint = env.endpoints.create_start(branch_name) + endpoint.stop() + # Use a barrier to make sure we restart endpoints at the same time + barrier.wait() + endpoint.start() + + workers = [] + + for i in range(0, n_threads): + branch_name = f"branch_{i}" + timeline_id = env.create_branch(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + workers.append(w) + w.start() + + for w in workers: + w.join() From f4f7ea247c05a56a90e4a7f99249133c58c8c443 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 14 Oct 2024 16:50:12 +0100 Subject: [PATCH 05/57] tests: make size comparisons more lenient (#9388) The empirically determined threshold doesn't hold for PG 17. Bump the limit to stabilise ci. --- test_runner/regress/test_tenant_size.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 9ea09d10d7..b41f1709bd 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -479,9 +479,9 @@ def assert_size_approx_equal(size_a, size_b): """ # Determined empirically from examples of equality failures: they differ - # by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid + # by page multiples of 8272, and usually by 1-3 pages. Tolerate 6 to avoid # failing on outliers from that observed range. 
- threshold = 4 * 8272 + threshold = 6 * 8272 assert size_a == pytest.approx(size_b, abs=threshold) From f54e3e9147bf1dd341e22a0fc01cf5c5d71843e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 17:54:03 +0200 Subject: [PATCH 06/57] Also consider offloaded timelines for obtaining retain_lsn (#9308) Also consider offloaded timelines for obtaining `retain_lsn`. This is required for correctness for all timelines that have not been flattened yet: otherwise we GC data that might still be required for reading. This somewhat counteracts the original purpose of timeline offloading of not having to iterate over offloaded timelines, but sadly it's required. In the future, we can improve the way the offloaded timelines are stored. We also make the `retain_lsn` optional so that in the future, when we implement flattening, we can make it None. This also applies to full timeline objects by the way, where it would probably make most sense to add a bool flag whether the timeline is successfully flattened, and if it is, one can exclude it from `retain_lsn` as well. Also, track whether a timeline was offloaded or not in `retain_lsn` so that the `retain_lsn` can be excluded from visibility and size calculation. 
Part of #8088 --- pageserver/src/tenant.rs | 56 ++++++++++++++++---- pageserver/src/tenant/size.rs | 8 +-- pageserver/src/tenant/timeline.rs | 21 +++++--- pageserver/src/tenant/timeline/compaction.rs | 9 ++-- 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2818d04dc..397778d4c8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -493,6 +493,8 @@ pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, // TODO: once we persist offloaded state, make this lazily constructed pub remote_client: Arc, @@ -504,10 +506,14 @@ pub struct OffloadedTimeline { impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { + let ancestor_retain_lsn = timeline + .get_ancestor_timeline_id() + .map(|_timeline_id| timeline.get_ancestor_lsn()); Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_retain_lsn, remote_client: timeline.remote_client.clone(), delete_progress: timeline.delete_progress.clone(), @@ -515,6 +521,12 @@ impl OffloadedTimeline { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub enum MaybeOffloaded { + Yes, + No, +} + #[derive(Clone)] pub enum TimelineOrOffloaded { Timeline(Arc), @@ -2253,12 +2265,13 @@ impl Tenant { if activating { let timelines_accessor = self.timelines.lock().unwrap(); + let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor); + self.initialize_gc_info(&timelines_accessor, 
&timelines_offloaded_accessor); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. @@ -3298,6 +3311,7 @@ impl Tenant { fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, + timelines_offloaded: &std::sync::MutexGuard>>, ) { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. @@ -3305,20 +3319,37 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + ancestor_children.push(( + timeline_entry.get_ancestor_lsn(), + *timeline_id, + MaybeOffloaded::No, + )); } }); + timelines_offloaded + .iter() + .for_each(|(timeline_id, timeline_entry)| { + let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { + return; + }; + let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { + return; + }; + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); + }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches for timeline in timelines.values() { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints 
.remove(&timeline.timeline_id) .unwrap_or_default(); @@ -4878,7 +4909,10 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); + assert_eq!( + branchpoints[0], + (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No) + ); } // You can read the key from the child branch even though the parent is @@ -8261,8 +8295,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8489,8 +8523,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8723,7 +8757,7 @@ mod tests { // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x10), space: Lsn(0x10), @@ -8737,7 +8771,7 @@ mod tests { // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x50), space: Lsn(0x50), diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 41d558d3f6..4a4c698b56 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use crate::context::RequestContext; use 
crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::Timeline; +use crate::tenant::{MaybeOffloaded, Timeline}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -264,10 +264,12 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) + .filter(|(lsn, _child_id, is_offloaded)| { + lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No + }) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fd4e699cf..8f098d0e82 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -139,8 +139,10 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + config::TenantConf, + storage_layer::{inmemory_layer, LayerVisibilityHint}, upload_queue::NotInitialized, + MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; @@ -450,7 +452,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. 
pub(crate) cutoffs: GcCutoffs, @@ -467,8 +469,13 @@ impl GcInfo { self.cutoffs.select_min() } - pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { - self.retain_lsns.push((child_lsn, child_id)); + pub(super) fn insert_child( + &mut self, + child_id: TimelineId, + child_lsn: Lsn, + is_offloaded: MaybeOffloaded, + ) { + self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } @@ -2164,7 +2171,9 @@ impl Timeline { if let Some(ancestor) = &ancestor { let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); - ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + // If we construct an explicit timeline object, it's obviously not offloaded + let is_offloaded = MaybeOffloaded::No; + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } Arc::new_cyclic(|myself| { @@ -4875,7 +4884,7 @@ impl Timeline { let retain_lsns = gc_info .retain_lsns .iter() - .map(|(lsn, _child_id)| *lsn) + .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9f64471432..8b9ace1e5b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,7 +42,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::DeltaLayer; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, @@ -639,7 +639,10 @@ impl Timeline { let children = self.gc_info.read().unwrap().retain_lsns.clone(); let mut readable_points = Vec::with_capacity(children.len() + 1); - for (child_lsn, _child_timeline_id) in &children { + for (child_lsn, _child_timeline_id, is_offloaded) in &children { + if *is_offloaded == MaybeOffloaded::Yes { + continue; + } readable_points.push(*child_lsn); } readable_points.push(head_lsn); @@ -1741,7 +1744,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for (lsn, _timeline_id) in &gc_info.retain_lsns { + for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } From dab96a6eb159ffa34ff98f8dfc3b2a6862441e02 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 14 Oct 2024 20:30:21 +0200 Subject: [PATCH 07/57] Add more timing histogram and gauge metrics to the Neon extension (#9116) We now also track: - Number of PS IOs in-flight - Number of pages cached by smgr prefetch implementation - IO timing histograms for LFC reads and writes, per IO issued ## Problem There's little insight into the timing metrics of LFC, and what the prefetch 
state of each backend is. This changes that, by measuring (and subsequently exposing) these data points. ## Summary of changes - Extract IOHistogram as separate type, rather than a collection of fields on NeonMetrics - others, see items above. Part of https://github.com/neondatabase/neon/issues/8926 --- pgxn/neon/file_cache.c | 27 ++++- pgxn/neon/neon_perf_counters.c | 174 +++++++++++++++++++++------------ pgxn/neon/neon_perf_counters.h | 42 ++++++-- pgxn/neon/pagestore_smgr.c | 35 +++++++ 4 files changed, 205 insertions(+), 73 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index d789526050..bbea5a8b0d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -43,6 +43,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -114,7 +115,9 @@ typedef struct FileCacheControl uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; - uint64 writes; + uint64 writes; /* number of writes issued */ + uint64 time_read; /* time spent reading (us) */ + uint64 time_write; /* time spent writing (us) */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ @@ -270,6 +273,8 @@ lfc_shmem_startup(void) lfc_ctl->hits = 0; lfc_ctl->misses = 0; lfc_ctl->writes = 0; + lfc_ctl->time_read = 0; + lfc_ctl->time_write = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -701,6 +706,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; + uint64 io_time_us = 0; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -795,6 +801,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber 
blkno, lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; + + if (iteration_hits) + { + lfc_ctl->time_read += io_time_us; + inc_page_cache_read_wait(io_time_us); + } + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); @@ -859,6 +872,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, struct iovec iov[PG_IOV_MAX]; int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + instr_time io_start, io_end; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -947,12 +961,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * blocks_in_chunk) @@ -965,9 +980,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { + uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += blocks_in_chunk; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a497d387c8..05db187076 100644 --- a/pgxn/neon/neon_perf_counters.c +++ 
b/pgxn/neon/neon_perf_counters.c @@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void) } } -/* - * Count a GetPage wait operation. - */ -void -inc_getpage_wait(uint64 latency_us) +static inline void +inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; - int hi = NUM_GETPAGE_WAIT_BUCKETS - 1; + int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; - if (latency_us < getpage_wait_bucket_thresholds[mid]) + if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } - MyNeonCounters->getpage_wait_us_bucket[lo]++; - MyNeonCounters->getpage_wait_us_sum += latency_us; - MyNeonCounters->getpage_wait_us_count++; + hist->wait_us_bucket[lo]++; + hist->wait_us_sum += latency_us; + hist->wait_us_count++; +} + +/* + * Count a GetPage wait operation. + */ +void +inc_getpage_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->getpage_hist, latency); +} + +/* + * Count an LFC read wait operation. + */ +void +inc_page_cache_read_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); +} + +/* + * Count an LFC write wait operation. 
+ */ +void +inc_page_cache_write_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } /* @@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us) typedef struct { - char *name; + const char *name; bool is_bucket; double bucket_le; double value; } metric_t; -static metric_t * -neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +static int +histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { -#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) - metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); - uint64 bucket_accum; - int i = 0; + int i = 0; + uint64 bucket_accum = 0; - metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].name = count; metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_wait_us_count; + metrics[i].value = (double) histogram->wait_us_count; i++; - metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].name = sum; metrics[i].is_bucket = false; - metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; - - bucket_accum = 0; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { - uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + uint64 threshold = io_wait_bucket_thresholds[bucketno]; - bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + bucket_accum += histogram->wait_us_bucket[bucketno]; - metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? 
INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } - metrics[i].name = "getpage_prefetch_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_requests_total; - i++; - metrics[i].name = "getpage_sync_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_sync_requests_total; - i++; - metrics[i].name = "getpage_prefetch_misses_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_misses_total; - i++; - metrics[i].name = "getpage_prefetch_discards_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_discards_total; - i++; - metrics[i].name = "pageserver_requests_sent_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_requests_sent_total; - i++; - metrics[i].name = "pageserver_disconnects_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_disconnects_total; - i++; - metrics[i].name = "pageserver_send_flushes_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_send_flushes_total; - i++; - metrics[i].name = "file_cache_hits_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->file_cache_hits_total; - i++; + + return i; +} + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + int i = 0; + +#define APPEND_METRIC(_name) do { \ + metrics[i].name = #_name; \ + metrics[i].is_bucket = false; \ + metrics[i].value = (double) counters->_name; \ + i++; \ + } while (false) + + i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); + + 
APPEND_METRIC(getpage_prefetch_requests_total); + APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(getpage_prefetch_misses_total); + APPEND_METRIC(getpage_prefetch_discards_total); + APPEND_METRIC(pageserver_requests_sent_total); + APPEND_METRIC(pageserver_disconnects_total); + APPEND_METRIC(pageserver_send_flushes_total); + APPEND_METRIC(pageserver_open_requests); + APPEND_METRIC(getpage_prefetches_buffered); + + APPEND_METRIC(file_cache_hits_total); + + i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); Assert(i == NUM_METRICS); +#undef APPEND_METRIC +#undef NUM_METRICS + /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; @@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) return (Datum) 0; } +static inline void +histogram_merge_into(IOHistogram into, IOHistogram from) +{ + into->wait_us_count += from->wait_us_count; + into->wait_us_sum += from->wait_us_sum; + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) + into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - totals.getpage_wait_us_count += counters->getpage_wait_us_count; - totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) - totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno]; + histogram_merge_into(&totals.getpage_hist, 
&counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.pageserver_open_requests += counters->pageserver_open_requests; + totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; + histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 49d477c4f8..8edc658a30 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -15,17 +15,26 @@ #include "storage/proc.h" #endif -static const uint64 getpage_wait_bucket_thresholds[] = { - 20, 30, 60, 100, /* 0 - 100 us */ +static const uint64 io_wait_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ - 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; -#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds)) +#define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds)) + +typedef struct IOHistogramData +{ + uint64 
wait_us_count; + uint64 wait_us_sum; + uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; +} IOHistogramData; + +typedef IOHistogramData *IOHistogram; typedef struct { @@ -39,9 +48,7 @@ typedef struct * the backend, but the 'neon_backend_perf_counters' view will convert * them to seconds, to make them more idiomatic as prometheus metrics. */ - uint64 getpage_wait_us_count; - uint64 getpage_wait_us_sum; - uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS]; + IOHistogramData getpage_hist; /* * Total number of speculative prefetch Getpage requests and synchronous @@ -50,7 +57,11 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; - /* XXX: It's not clear to me when these misses happen. */ + /* + * Total number of readahead misses; consisting of either prefetches that + * don't satisfy the LSN bounds, or cases where no readahead was issued + * for the read. + */ uint64 getpage_prefetch_misses_total; /* @@ -80,6 +91,16 @@ typedef struct * this can be smaller than pageserver_requests_sent_total. */ uint64 pageserver_send_flushes_total; + + /* + * Number of open requests to PageServer. + */ + uint64 pageserver_open_requests; + + /* + * Number of unused prefetches currently cached in this backend. + */ + uint64 getpage_prefetches_buffered; /* * Number of requests satisfied from the LFC. 
@@ -91,6 +112,9 @@ typedef struct */ uint64 file_cache_hits_total; + /* LFC I/O time buckets */ + IOHistogramData file_cache_read_hist; + IOHistogramData file_cache_write_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; #endif extern void inc_getpage_wait(uint64 latency); +extern void inc_page_cache_read_wait(uint64 latency); +extern void inc_page_cache_write_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3d9d9285df..f46df7f70a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra) newPState->n_unused -= 1; } + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { prefetch_set_unused(end); @@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot) MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; @@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void) prefetch_set_unused(ring_index); } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } /* @@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index) MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + 
MyPState->n_responses_buffered; } else { @@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, hashkey.buftag = tag; Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { @@ -1001,6 +1029,9 @@ Retry: prefetch_do_request(slot, lsns); } + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + Assert(any_hits); Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || @@ -1076,8 +1107,10 @@ page_server_request(void const *req) { /* do nothing */ } + MyNeonCounters->pageserver_open_requests++; consume_prefetch_responses(); resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; } PG_CATCH(); { @@ -1086,6 +1119,8 @@ page_server_request(void const *req) * point, but this currently seems fine for now. */ page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + PG_RE_THROW(); } PG_END_TRY(); From 0fc4ada3ca9b1eb264bff9c6407ad050722578ae Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 14 Oct 2024 21:12:43 +0100 Subject: [PATCH 08/57] Switch CI, Storage and Proxy to Debian 12 (Bookworm) (#9170) ## Problem This PR switches CI and Storage to Debian 12 (Bookworm) based images.
## Summary of changes - Add Debian codename (`bookworm`/`bullseye`) to most of docker tags, create un-codenamed images to be used by default - `vm-compute-node-image`: create a separate spec for `bookworm` (we don't need to build cgroups in the future) - `neon-image`: Switch to `bookworm`-based `build-tools` image - Storage components and Proxy use it - CI: run lints and tests on `bookworm`-based `build-tools` image --- .../actions/allure-report-generate/action.yml | 2 +- .../actions/run-python-test-set/action.yml | 2 +- .github/workflows/_build-and-test-locally.yml | 8 +- .github/workflows/build-build-tools-image.yml | 31 ++-- .github/workflows/build_and_test.yml | 136 ++++++++++-------- .github/workflows/neon_extra_builds.yml | 2 +- .github/workflows/pg-clients.yml | 4 +- .github/workflows/pin-build-tools-image.yml | 23 ++- Dockerfile | 4 +- Dockerfile.build-tools | 19 +-- compute/Dockerfile.compute-node | 27 ++-- compute/vm-image-spec-bookworm.yaml | 126 ++++++++++++++++ ...-spec.yaml => vm-image-spec-bullseye.yaml} | 0 13 files changed, 280 insertions(+), 104 deletions(-) create mode 100644 compute/vm-image-spec-bookworm.yaml rename compute/{vm-image-spec.yaml => vm-image-spec-bullseye.yaml} (100%) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 11adc8df86..2bdb727719 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 
330e875d56..037b9aeb1e 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5fc6aa247a..3aa671fab1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ 
hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 130753833d..0f05276579 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -43,6 +43,7 @@ jobs: strategy: matrix: + debian-version: [ bullseye, bookworm ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -81,22 +82,22 @@ jobs: - uses: docker/build-push-action@v6 with: + file: Dockerfile.build-tools context: . 
provenance: false push: true pull: true - file: Dockerfile.build-tools - cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} - tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + build-args: | + DEBIAN_VERSION=${{ matrix.debian-version }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} + tags: | + neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 - env: - IMAGE_TAG: ${{ inputs.image-tag }} - steps: - uses: docker/login-action@v3 with: @@ -104,7 +105,17 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Create multi-arch image + env: + DEFAULT_DEBIAN_VERSION: bullseye + IMAGE_TAG: ${{ inputs.image-tag }} run: | - docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ - neondatabase/build-tools:${IMAGE_TAG}-x64 \ - neondatabase/build-tools:${IMAGE_TAG}-arm64 + for debian_version in bullseye bookworm; do + tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64 + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7193cfe19..51f6975e63 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -92,7 +92,7 
@@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -106,7 +106,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -181,7 +181,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -261,7 +261,7 @@ jobs: uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} - build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds @@ -276,7 +276,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -289,7 +289,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-${{ 
runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -309,7 +309,7 @@ jobs: needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -367,7 +367,7 @@ jobs: runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -415,7 +415,7 @@ jobs: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -559,15 +559,16 @@ jobs: ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm + DEBIAN_VERSION=bookworm provenance: false push: true pull: true file: Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} + 
cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, tag ] @@ -582,8 +583,9 @@ jobs: - name: Create multi-arch image run: | docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - uses: docker/login-action@v3 with: @@ -604,17 +606,16 @@ jobs: version: # Much data was already generated on old PG versions with bullseye's # libraries, the locales of which can cause data incompatibilities. - # However, new PG versions should check if they can be built on newer - # images, as that reduces the support burden of old and ancient - # distros. + # However, new PG versions should be built on newer images, + # as that reduces the support burden of old and ancient distros.
- pg: v14 - debian: bullseye-slim + debian: bullseye - pg: v15 - debian: bullseye-slim + debian: bullseye - pg: v16 - debian: bullseye-slim + debian: bullseye - pg: v17 - debian: bookworm-slim + debian: bookworm arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -659,16 +660,16 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg == 'v16' @@ -679,17 +680,17 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ 
needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once @@ -704,14 +705,16 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node + cache-from: type=registry,ref=cache.neon.build/compute-tools-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name
== 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, tag ] @@ -719,7 +722,16 @@ jobs: strategy: matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm steps: - uses: docker/login-action@v3 @@ -729,23 +741,26 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - 
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - uses: docker/login-action@v3 with: @@ -753,13 +768,13 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + 
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -770,7 +785,16 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm env: VM_BUILDER_VERSION: v0.35.0 @@ -792,18 +816,18 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ - -spec=compute/vm-image-spec.yaml \ - -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml 
index 140aac032a..287c9ea281 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -155,7 +155,7 @@ jobs: github.ref_name == 'main' runs-on: [ self-hosted, large ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 23a2e3876c..df40b5beda 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -150,7 +150,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 2e79498fc4..c196d07d3e 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -71,7 +71,7 @@ jobs: steps: - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -94,8 +93,22 @@ jobs: az acr login --name=neoneastus2 - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR + env: + DEFAULT_DEBIAN_VERSION: bullseye run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ - -t
neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ - -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} + for debian_version in bullseye bookworm; do + tags=() + + tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") + + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${TO_TAG}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${FROM_TAG}-${debian_version} + done diff --git a/Dockerfile b/Dockerfile index bdb76a4f4f..785dd4598e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -57,7 +59,7 @@ RUN set -e \ # Build final image # -FROM debian:bullseye-slim +FROM debian:${DEBIAN_FLAVOR} ARG DEFAULT_PG_VERSION WORKDIR /data diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d8bcacf228..54e9134257 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,12 +1,7 @@ -FROM debian:bullseye-slim +ARG DEBIAN_VERSION=bullseye -# Use ARG as a build-time environment variable here to allow. -# It's not supposed to be set outside. -# Alternatively it can be obtained using the following command -# ``` -# . 
/etc/os-release && echo "${VERSION_CODENAME}" -# ``` -ARG DEBIAN_VERSION_CODENAME=bullseye +FROM debian:${DEBIAN_VERSION}-slim +ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home @@ -42,14 +37,14 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - libstdc++-10-dev \ + $([ "${DEBIAN_VERSION}" = "bullseye" ] && echo libstdc++-10-dev || echo libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ libxxhash-dev \ lsof \ make \ - netcat \ + netcat-openbsd \ net-tools \ openssh-client \ parallel \ @@ -78,7 +73,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ @@ -86,7 +81,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ # Install docker RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ && apt install -y
docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 15afb9897f..91528618da 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_FLAVOR=bullseye-slim +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### # @@ -11,20 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim # ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION -RUN case $DEBIAN_FLAVOR in \ +RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. 
- bullseye*) \ + bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ ;; \ # Version-specific installs for Bookworm (PG17): - bookworm*) \ + bookworm) \ VERSION_INSTALLS="cmake"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt update && \ apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ @@ -1091,7 +1095,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS compute-tools-image -ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1102,7 +1105,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS pgbouncer -ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install --no-install-recommends -y \ @@ -1257,7 +1259,7 @@ ENV PGDATABASE=postgres # ######################################################################################### FROM debian:$DEBIAN_FLAVOR -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1305,19 +1307,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca RUN apt update && \ - case $DEBIAN_FLAVOR in \ + case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # libicu67, locales for collations (including ICU and plpgsql_check) # libgdal28, libproj19 for PostGIS - bullseye*) \ + bullseye) \ VERSION_INSTALLS="libicu67 
libgdal28 libproj19"; \ ;; \ # Version-specific installs for Bookworm (PG17): # libicu72, locales for collations (including ICU and plpgsql_check) # libgdal32, libproj25 for PostGIS - bookworm*) \ + bookworm) \ VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt install --no-install-recommends -y \ gdb \ diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml new file mode 100644 index 0000000000..51a55b513f --- /dev/null +++ b/compute/vm-image-spec-bookworm.yaml @@ -0,0 +1,126 @@ +# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. +--- +commands: + - name: cgconfigparser + user: root + sysvInitAction: sysinit + shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' + - name: pgbouncer + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + - name: local_proxy + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + - name: postgres-exporter + user: nobody + sysvInitAction: respawn + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter 
-config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' +shutdownHook: | + su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' +files: + - filename: compute_ctl-sudoers + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + - filename: cgconfig.conf + content: | + # Configuration for cgroups in VM compute nodes + group neon-postgres { + perm { + admin { + uid = postgres; + } + task { + gid = users; + } + } + memory {} + } +build: | + # Build cgroup-tools + # + # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically + # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor + # requires cgroup v2, so we'll build cgroup-tools ourselves. + # + # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, + # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset + # for debian version migration. 
+ # + FROM debian:bookworm-slim as libcgroup-builder + ENV LIBCGROUP_VERSION=v2.0.3 + + RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install +merge: | + # tweak nofile limits + RUN set -e \ + && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ + && test ! -e /etc/security || ( \ + echo '* - nofile 1048576' >>/etc/security/limits.conf \ + && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ + ) + + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
+ RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers + + COPY cgconfig.conf /etc/cgconfig.conf + + RUN set -e \ + && chmod 0644 /etc/cgconfig.conf + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ + COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ + COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec-bullseye.yaml similarity index 100% rename from compute/vm-image-spec.yaml rename to compute/vm-image-spec-bullseye.yaml From 73c6626b381bd013064d72332c3a0a372c555877 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 15 Oct 2024 09:31:18 +0100 Subject: [PATCH 09/57] pageserver: stabilize & refine controller scale test (#8971) ## Problem We were seeing timeouts on migrations in this test. The test unfortunately tends to saturate local storage, which is shared between the pageservers and the control plane database, which makes the test kind of unrealistic. We will also want to increase the scale of this test, so it's worth fixing that. ## Summary of changes - Instead of randomly creating timelines at the same time as the other background operations, explicitly identify a subset of tenants which will have timelines, and create them at the start. This avoids pageservers putting a lot of load on the test node during the main body of the test. - Adjust the tenants created to create some number of 8 shard tenants and the rest 1 shard tenants, instead of just creating a lot of 2 shard tenants. - Use archival_config to exercise tenant-mutating operations, instead of using timeline creation for this. - Adjust reconcile_until_idle calls to avoid waiting 5 seconds between calls, which causes timeouts with large shard count tenants.
- Fix a pageserver bug where calls to archival_config during activation get 404 --- libs/utils/src/http/error.rs | 7 + pageserver/src/http/routes.rs | 2 + proxy/src/serverless/http_util.rs | 4 + storage_controller/src/service.rs | 5 + test_runner/fixtures/neon_fixtures.py | 6 +- .../test_storage_controller_scale.py | 225 ++++++++++++++---- 6 files changed, 204 insertions(+), 45 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 5e05e4e713..02fc9e3b99 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -28,6 +28,9 @@ pub enum ApiError { #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), + #[error("Too many requests: {0}")] + TooManyRequests(Cow<'static, str>), + #[error("Shutting down")] ShuttingDown, @@ -73,6 +76,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2985ab1efb..1079d8df29 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -715,6 +715,8 @@ async fn timeline_archival_config_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant .apply_timeline_archival_config(timeline_id, request_data.state, ctx) .await?; diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 87a72ec5f0..c1c5764d17 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -41,6 +41,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, 
+ ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cc735dc27e..cedee54534 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // storage controller's auth configuration. ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) } + mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => { + // Pass through 429 errors: if pageserver is asking us to wait + retry, we in + // turn ask our clients to wait + retry + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } mgmt_api::Error::ApiError(status, msg) => { // Presume general case of pageserver API errors is that we tried to do something // that can't be done right now. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 059707c8ed..a313ac2ed3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1986,11 +1986,11 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"reconcile_all waited for {n} shards") return n - def reconcile_until_idle(self, timeout_secs=30): + def reconcile_until_idle(self, timeout_secs=30, max_interval=5): start_at = time.time() n = 1 - delay_sec = 0.5 - delay_max = 5 + delay_sec = 0.1 + delay_max = max_interval while n > 0: n = self.reconcile_all() if n == 0: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 452a856714..d2eba751f8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,9 +4,10 @@ import concurrent.futures import random import time from collections import defaultdict +from enum 
import Enum import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -34,6 +35,7 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ if tenant_placement[tid]["intent"]["attached"] == tenant_placement[tid]["observed"]["attached"] } + assert len(matching) == total_shards attached_per_node: defaultdict[str, int] = defaultdict(int) @@ -107,15 +109,48 @@ def test_storage_controller_many_tenants( ps.allowed_errors.append(".*request was dropped before completing.*") # Total tenants - tenant_count = 4000 + small_tenant_count = 7800 + large_tenant_count = 200 + tenant_count = small_tenant_count + large_tenant_count + large_tenant_shard_count = 8 + total_shards = small_tenant_count + large_tenant_count * large_tenant_shard_count - # Shards per tenant - shard_count = 2 - stripe_size = 1024 + # A small stripe size to encourage all shards to get some data + stripe_size = 1 - total_shards = tenant_count * shard_count + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. 
+    rng = random.Random(1234)
 
-    tenants = set(TenantId.generate() for _i in range(0, tenant_count))
+    class Tenant:
+        def __init__(self):
+            # Tenants may optionally contain a timeline
+            self.timeline_id = None
+
+            # Tenants may be marked as 'large' to get multiple shards during creation phase
+            self.large = False
+
+    tenant_ids = list(TenantId.generate() for _i in range(0, tenant_count))
+    tenants = dict((tid, Tenant()) for tid in tenant_ids)
+
+    # We will create timelines in only a subset of tenants, because creating timelines
+    # does many megabytes of IO, and we want to densely simulate huge tenant counts on
+    # a single test node.
+    tenant_timelines_count = 100
+
+    # These lists are maintained for use with rng.choice
+    tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count))
+    tenants_without_timelines = list(
+        tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines
+    )
+
+    # For our sharded tenants, we will make half of them with timelines and half without
+    assert large_tenant_count >= tenant_timelines_count / 2
+    for tenant_id in tenants_with_timelines[0 : large_tenant_count // 2]:
+        tenants[tenant_id].large = True
+
+    for tenant_id in tenants_without_timelines[0 : large_tenant_count // 2]:
+        tenants[tenant_id].large = True
 
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
 
@@ -125,23 +160,39 @@ def test_storage_controller_many_tenants(
     rss = env.storage_controller.get_metric_value("process_resident_memory_bytes")
     assert rss is not None
-    log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)")
-    assert rss < expect_memory_per_shard * shard_count * tenant_count
-
-    # We use a fixed seed to make the test somewhat reproducible: we want a randomly
-    # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run.
- rng = random.Random(1234) + log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + assert rss < expect_memory_per_shard * total_shards # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 - # We will create tenants directly via API, not via neon_local, to avoid any false - # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) - with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: - futs = [] + # A different concurrency limit for bulk tenant+timeline creations: these do I/O and will + # start timing on test nodes if we aren't a bit careful. + create_concurrency = 16 + + class Operation(str, Enum): + TIMELINE_OPS = "timeline_ops" + SHARD_MIGRATE = "shard_migrate" + TENANT_PASSTHROUGH = "tenant_passthrough" + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + + # Creation phase: make a lot of tenants, and create timelines in a subset of them + # This executor has concurrency set modestly, to avoid overloading pageservers with timeline creations. + with concurrent.futures.ThreadPoolExecutor(max_workers=create_concurrency) as executor: + tenant_create_futs = [] t1 = time.time() - for tenant_id in tenants: + + for tenant_id, tenant in tenants.items(): + if tenant.large: + shard_count = large_tenant_shard_count + else: + shard_count = 1 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) f = executor.submit( env.storage_controller.tenant_create, tenant_id, @@ -152,44 +203,106 @@ def test_storage_controller_many_tenants( tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) - futs.append(f) + tenant_create_futs.append(f) - # Wait for creations to finish - for f in futs: + # Wait for tenant creations to finish + for f in tenant_create_futs: f.result() log.info( f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" ) - run_ops = api_concurrency * 4 - assert run_ops < len(tenants) - op_tenants = list(tenants)[0:run_ops] + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Create timelines in those tenants which are going to get one + t1 = time.time() + timeline_create_futs = [] + for tenant_id in tenants_with_timelines: + timeline_id = TimelineId.generate() + tenants[tenant_id].timeline_id = timeline_id + f = executor.submit( + env.storage_controller.pageserver_api().timeline_create, + PgVersion.NOT_SET, + tenant_id, + timeline_id, + ) + timeline_create_futs.append(f) + + for f in timeline_create_futs: + f.result() + log.info( + f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" + ) + + # Plan operations: ensure each tenant with a timeline gets at least + # one of each operation type. Then add other tenants to make up the + # numbers. 
+ ops_plan = [] + for tenant_id in tenants_with_timelines: + ops_plan.append((tenant_id, Operation.TIMELINE_OPS)) + ops_plan.append((tenant_id, Operation.SHARD_MIGRATE)) + ops_plan.append((tenant_id, Operation.TENANT_PASSTHROUGH)) + + # Fill up remaining run_ops with migrations of tenants without timelines + other_migrate_tenants = rng.sample(tenants_without_timelines, run_ops - len(ops_plan)) + + for tenant_id in other_migrate_tenants: + ops_plan.append( + ( + tenant_id, + rng.choice([Operation.SHARD_MIGRATE, Operation.TENANT_PASSTHROUGH]), + ) + ) + + # Exercise phase: pick pseudo-random operations to do on the tenants + timelines + # This executor has concurrency high enough to stress the storage controller API. + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + + def exercise_timeline_ops(tenant_id, timeline_id): + # A read operation: this requires looking up shard zero and routing there + detail = virtual_ps_http.timeline_detail(tenant_id, timeline_id) + assert detail["timeline_id"] == str(timeline_id) + + # A fan-out write operation to all shards in a tenant. + # - We use a metadata operation rather than something like a timeline create, because + # timeline creations are I/O intensive and this test isn't meant to be a stress test for + # doing lots of concurrent timeline creations. 
+ archival_state = rng.choice( + [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] + ) + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) # Generate a mixture of operations and dispatch them all concurrently futs = [] - for tenant_id in op_tenants: - op = rng.choice([0, 1, 2]) - if op == 0: - # A fan-out write operation to all shards in a tenant (timeline creation) + for tenant_id, op in ops_plan: + if op == Operation.TIMELINE_OPS: + op_timeline_id = tenants[tenant_id].timeline_id + assert op_timeline_id is not None + + # Exercise operations that modify tenant scheduling state but require traversing + # the fan-out-to-all-shards functionality. f = executor.submit( - virtual_ps_http.timeline_create, - PgVersion.NOT_SET, + exercise_timeline_ops, tenant_id, - TimelineId.generate(), + op_timeline_id, ) - elif op == 1: + elif op == Operation.SHARD_MIGRATE: # A reconciler operation: migrate a shard. - shard_number = rng.randint(0, shard_count - 1) - tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + desc = env.storage_controller.tenant_describe(tenant_id) + + shard_number = rng.randint(0, len(desc["shards"]) - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, len(desc["shards"])) # Migrate it to its secondary location - desc = env.storage_controller.tenant_describe(tenant_id) dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) - elif op == 2: + elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero f = executor.submit(virtual_ps_http.tenant_status, tenant_id) @@ -199,10 +312,18 @@ def test_storage_controller_many_tenants( for f in futs: f.result() + log.info("Completed mixed operations phase") + # Some of the operations above (notably migrations) might leave the controller in a state where it has # some work to do, for example optimizing shard placement after we do 
a random migration. Wait for the system
     # to reach a quiescent state before doing following checks.
-    env.storage_controller.reconcile_until_idle()
+    #
+    # - Set max_interval low because we probably have a significant number of optimizations to complete and would like
+    # the test to run quickly.
+    # - Set timeout high because we might be waiting for optimizations that require a secondary
+    # to warm up, and if we just started a secondary in the previous step, it might wait some time
+    # before downloading its heatmap
+    env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
 
     env.storage_controller.consistency_check()
 
     check_memory()
@@ -213,6 +334,7 @@ def test_storage_controller_many_tenants(
     #
     # We do not require that the system is quiescent already here, although at present in this point in the test
     # that may be the case.
+    log.info("Reconciling all & timing")
     while True:
         t1 = time.time()
         reconcilers = env.storage_controller.reconcile_all()
@@ -225,6 +347,7 @@ def test_storage_controller_many_tenants(
             break
 
     # Restart the storage controller
+    log.info("Restarting controller")
     env.storage_controller.stop()
     env.storage_controller.start()
@@ -246,7 +369,16 @@ def test_storage_controller_many_tenants(
 
     # Restart pageservers gracefully: this exercises the /re-attach pageserver API
     # and the storage controller drain and fill API
+    log.info("Restarting pageservers...")
+
+    # Parameters for how long we expect it to take to migrate all of the tenants from/to
+    # a node during a drain/fill operation
+    DRAIN_FILL_TIMEOUT = 240
+    DRAIN_FILL_BACKOFF = 5
+
     for ps in env.pageservers:
+        log.info(f"Draining pageserver {ps.id}")
+        t1 = time.time()
         env.storage_controller.retryable_node_operation(
             lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
         )
@@ -255,9 +387,10 @@
             ps.id,
             PageserverAvailability.ACTIVE,
             PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
-            max_attempts=24,
-            
backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Drained pageserver {ps.id} in {time.time() - t1}s") shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -275,6 +408,7 @@ def test_storage_controller_many_tenants( backoff=1, ) + log.info(f"Filling pageserver {ps.id}") env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -282,16 +416,23 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.ACTIVE, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Filled pageserver {ps.id} in {time.time() - t1}s") + + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") assert_consistent_balanced_attachments(env, total_shards) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, From ec4cc30de9bc1140761a7f8b7e4a5886c4d3b4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 15 Oct 2024 11:46:51 +0200 Subject: [PATCH 10/57] Shut down timelines during offload and add offload tests (#9289) Add a test for timeline offloading, and subsequent unoffloading. 
Also adds a manual endpoint, and issues a proper timeline shutdown during offloading which prevents a pageserver hang at shutdown. Part of #8088. --- pageserver/src/http/routes.rs | 49 ++++++++++++ pageserver/src/tenant.rs | 29 +++++++ pageserver/src/tenant/timeline/offload.rs | 3 + test_runner/fixtures/pageserver/http.py | 16 ++++ test_runner/regress/test_timeline_archive.py | 84 ++++++++++++++++++++ 5 files changed, 181 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1079d8df29..dd403c1cef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -325,6 +326,7 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } @@ -1785,6 +1787,49 @@ async fn timeline_compact_handler( .await } +// Run offload immediately on given timeline. 
+async fn timeline_offload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if tenant.get_offloaded_timeline(timeline_id).is_ok() { + return json_response(StatusCode::OK, ()); + } + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if !tenant.timeline_has_no_attached_children(timeline_id) { + return Err(ApiError::PreconditionFailed( + "timeline has attached children".into(), + )); + } + if !timeline.can_offload() { + return Err(ApiError::PreconditionFailed( + "Timeline::can_offload() returned false".into(), + )); + } + offload_timeline(&tenant, &timeline) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, @@ -3008,6 +3053,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", + |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 397778d4c8..44d1bb74ca 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -619,6 +619,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("Cancelled")] + Cancelled, + #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), @@ -637,6 +640,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() @@ -1552,6 +1556,7 @@ impl Tenant { timeline_id: TimelineId, ctx: RequestContext, ) -> Result, TimelineArchivalError> { + info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -1566,6 +1571,7 @@ impl Tenant { error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } + Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled), Err(e) => { // Some (possibly ephemeral) error happened during index_part download. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? 
({e})"); @@ -1603,6 +1609,7 @@ impl Tenant { if offloaded_timelines.remove(&timeline_id).is_none() { warn!("timeline already removed from offloaded timelines"); } + info!("timeline unoffloading complete"); Ok(Arc::clone(timeline)) } else { warn!("timeline not available directly after attach"); @@ -1683,6 +1690,21 @@ impl Tenant { Ok(()) } + pub fn get_offloaded_timeline( + &self, + timeline_id: TimelineId, + ) -> Result, GetTimelineError> { + self.timelines_offloaded + .lock() + .unwrap() + .get(&timeline_id) + .map(Arc::clone) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_shard_id, + timeline_id, + }) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -2218,6 +2240,13 @@ impl Tenant { } } + pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { + let timelines = self.timelines.lock().unwrap(); + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index fb906d906b..7e6084baaf 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
+ timeline.shutdown(super::ShutdownMode::Hard).await; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index aa4435af4e..18d65cb7de 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) + def timeline_offload( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload", + ) + log.info(f"Got offload request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 841707d32e..971cc57a1c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until @pytest.mark.parametrize("shard_count", [0, 4]) @@ -114,3 +115,86 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) + + +@pytest.mark.parametrize("manual_offload", [False, True]) +def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: 
we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s" if manual_offload else "1s", + } + ) + + # Create two branches and archive them + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded(timeline_id: TimelineId) -> bool: + return ( + env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") + is not None + ) + + if manual_offload: + with pytest.raises( + PageserverApiException, + match="timeline has attached children", + ): + # This only tests the (made for testing only) http handler, + # but still demonstrates the constraints we have. 
+ ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + + def parent_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + assert timeline_offloaded(parent_timeline_id) + + def leaf_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) + assert timeline_offloaded(leaf_timeline_id) + + wait_until(30, 1, leaf_offloaded) + wait_until(30, 1, parent_offloaded) + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + assert not timeline_offloaded(initial_timeline_id) From d92d36a315f955cd39bc6f6b0948bae25ed195ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 15 Oct 2024 13:13:57 +0100 Subject: [PATCH 11/57] [local_proxy] update api for pg_session_jwt (#9359) pg_session_jwt now: 1. Sets the JWK in a PGU_BACKEND session guc, no longer in the init() function. 2. JWK no longer needs the kid. 
--- Cargo.lock | 7 +- Cargo.toml | 1 + compute/Dockerfile.compute-node | 4 +- proxy/Cargo.toml | 3 +- proxy/src/serverless/backend.rs | 49 ++++---- proxy/src/serverless/local_conn_pool.rs | 143 ++++++++++++++++-------- workspace_hack/Cargo.toml | 6 +- 7 files changed, 139 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5edf5cf7b4..7e772814ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.5", + "serde", ] [[package]] @@ -2794,9 +2795,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -4296,6 +4297,7 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "itoa", "jose-jwa", "jose-jwk", "lasso", @@ -7307,6 +7309,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", + "indexmap 2.0.1", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index dde80f5020..a1a974b33b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ indexmap = "2" indoc = "2" ipnet = "2.9.0" itertools = "0.10" +itoa = "1.0.11" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 91528618da..412c64eda4 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -929,8 +929,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ - echo 
"1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ + echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 963fb94a7d..e25d2fcbab 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,9 +42,10 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } -indexmap.workspace = true +indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true +itoa.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2b060af9e1..927854897f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -2,8 +2,9 @@ use std::{io, sync::Arc, time::Duration}; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tokio_postgres::types::ToSql; use tracing::{debug, field::display, info}; use crate::{ @@ -267,50 +268,58 @@ impl PoolingBackend { auth::Backend::Local(local) => local.node_info.clone(), }; + let (key, jwk) = create_random_jwk(); + let config 
= node_info .config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname); + .dbname(&conn_info.dbname) + .options(&format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + let pid = client.get_process_id(); + tracing::Span::current().record("pid", pid); - let handle = local_conn_pool::poll_client( + let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, + key, conn_id, node_info.aux.clone(), ); - let kid = handle.get_client().get_process_id() as i64; - let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk(); + { + let (client, mut discard) = handle.inner(); + debug!("setting up backend session state"); - debug!(kid, ?jwk, "setting up backend session state"); + // initiates the auth session + if let Err(e) = client.query("select auth.init()", &[]).await { + discard.discard(); + return Err(e.into()); + } - // initiates the auth session - handle - .get_client() - .query( - "select auth.init($1, $2);", - &[ - &kid as &(dyn ToSql + Sync), - &tokio_postgres::types::Json(jwk), - ], - ) - .await?; - - info!(?kid, "backend session state init"); + info!("backend session state initialized"); + } Ok(handle) } } +fn create_random_jwk() -> (SigningKey, JwkEcKey) { + let key = SigningKey::random(&mut OsRng); + let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); + (key, jwk) +} + #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1dde5952e1..4ab14ad35f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ 
b/proxy/src/serverless/local_conn_pool.rs @@ -1,9 +1,9 @@ use futures::{future::poll_fn, Future}; +use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use rand::rngs::OsRng; -use serde_json::Value; +use serde_json::value::RawValue; use signature::Signer; use std::task::{ready, Poll}; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; @@ -12,14 +12,13 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; -use typed_json::json; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{context::RequestMonitoring, DbName, RoleName}; -use tracing::{debug, error, warn, Span}; +use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; @@ -245,12 +244,14 @@ impl LocalConnPool { } } +#[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, client: tokio_postgres::Client, mut connection: tokio_postgres::Connection, + key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> LocalClient { @@ -346,8 +347,6 @@ pub(crate) fn poll_client( } .instrument(span)); - let key = SigningKey::random(&mut OsRng); - let inner = ClientInner { inner: client, session: tx, @@ -430,13 +429,6 @@ impl LocalClient { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn key(&self) -> &SigningKey { - let inner = &self - .inner - .as_ref() - .expect("client inner should not be removed"); - &inner.key - } } impl LocalClient { @@ -445,25 +437,9 @@ impl LocalClient { .inner .as_mut() .expect("client inner should not be 
removed"); + inner.jti += 1; - - let kid = inner.inner.get_process_id(); - let header = json!({"kid":kid}).to_string(); - - let mut payload = serde_json::from_slice::>(payload) - .map_err(HttpConnError::JwtPayloadError)?; - payload.insert("jti".to_string(), Value::Number(inner.jti.into())); - let payload = Value::Object(payload).to_string(); - - debug!( - kid, - jti = inner.jti, - ?header, - ?payload, - "signing new ephemeral JWT" - ); - - let token = sign_jwt(&inner.key, header, payload); + let token = resign_jwt(&inner.key, payload, inner.jti)?; // initiates the auth session inner.inner.simple_query("discard all").await?; @@ -475,20 +451,74 @@ impl LocalClient { ) .await?; - info!(kid, jti = inner.jti, "user session state init"); + let pid = inner.inner.get_process_id(); + info!(pid, jti = inner.jti, "user session state init"); Ok(()) } } -fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String { - let header = Base64UrlUnpadded::encode_string(header.as_bytes()); - let payload = Base64UrlUnpadded::encode_string(payload.as_bytes()); +/// implements relatively efficient in-place json object key upserting +/// +/// only supports top-level keys +fn upsert_json_object( + payload: &[u8], + key: &str, + value: &RawValue, +) -> Result { + let mut payload = serde_json::from_slice::>(payload)?; + payload.insert(key, value); + serde_json::to_string(&payload) +} - let message = format!("{header}.{payload}"); - let sig: Signature = sk.sign(message.as_bytes()); - let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes()); - format!("{message}.{base64_sig}") +fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { + let mut buffer = itoa::Buffer::new(); + + // encode the jti integer to a json rawvalue + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap(); + + // update the jti in-place + let payload = + upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; + + // sign the jwt + let token = 
sign_jwt(sk, payload.as_bytes()); + + Ok(token) +} + +fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { + let header_len = 20; + let payload_len = Base64UrlUnpadded::encoded_len(payload); + let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); + let total_len = header_len + payload_len + signature_len + 2; + + let mut jwt = String::with_capacity(total_len); + let cap = jwt.capacity(); + + // we only need an empty header with the alg specified. + // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" + jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + + // encode the jwt payload in-place + base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + + // create the signature from the encoded header || payload + let sig: Signature = sk.sign(jwt.as_bytes()); + + jwt.push('.'); + + // encode the jwt signature in-place + base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + + debug_assert_eq!( + jwt.len(), + total_len, + "the jwt len should match our expected len" + ); + debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); + + jwt } impl Discard<'_, C> { @@ -509,14 +539,6 @@ impl Discard<'_, C> { } impl LocalClient { - pub fn get_client(&self) -> &C { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } - fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -542,3 +564,30 @@ impl Drop for LocalClient { } } } + +#[cfg(test)] +mod tests { + use p256::ecdsa::SigningKey; + use typed_json::json; + + use super::resign_jwt; + + #[test] + fn jwt_token_snapshot() { + let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let data = + json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); + + let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); + + // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. 
+ // In the public-key box, paste the following jwk public key + // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + + // let pub_key = p256::ecdsa::VerifyingKey::from(&key); + // let pub_key = p256::PublicKey::from(pub_key); + // println!("{}", pub_key.to_jwk_string()); + + assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0a90b6b6f7..1347d6ddff 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -46,7 +46,8 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] 
} +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From fb74c21e8cae23831b7728232772315297463e63 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 15 Oct 2024 15:24:56 +0200 Subject: [PATCH 12/57] proxy: Migrate jwt module away from anyhow (#9361) --- proxy/src/auth/backend/jwt.rs | 188 +++++++++++++++++------ proxy/src/auth/backend/local.rs | 6 +- proxy/src/auth/backend/mod.rs | 3 +- proxy/src/control_plane/provider/mock.rs | 10 +- proxy/src/control_plane/provider/mod.rs | 43 +++++- proxy/src/control_plane/provider/neon.rs | 27 ++-- proxy/src/proxy/tests/mod.rs | 42 ++--- proxy/src/proxy/wake_compute.rs | 2 +- 8 files changed, 228 insertions(+), 93 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 17ab7eda22..402e59fdb3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,21 +4,20 @@ use std::{ time::{Duration, SystemTime}, }; -use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; use serde::{de::Visitor, Deserialize, Deserializer}; use signature::Verifier; +use thiserror::Error; use tokio::time::Instant; use crate::{ - context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, - RoleName, + auth::backend::ComputeCredentialKeys, context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, + intern::RoleNameInt, EndpointId, RoleName, }; -use super::ComputeCredentialKeys; - // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); @@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> impl Future>> + Send; + ) -> impl Future, FetchAuthRulesError>> + Send; +} + +#[derive(Error, Debug)] +pub(crate) enum FetchAuthRulesError { + #[error(transparent)] + GetEndpointJwks(#[from] GetEndpointJwksError), + + #[error("JWKs settings for this role were not configured")] + RoleJwksNotConfigured, } pub(crate) struct AuthRule { @@ -122,7 +130,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, auth_rules: &F, - ) -> anyhow::Result> { + ) -> Result, JwtError> { // double check that no one beat us to updating the cache. let now = Instant::now(); let guard = self.cached.load_full(); @@ -188,7 +196,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, fetch: &F, - ) -> Result, anyhow::Error> { + ) -> Result, JwtError> { let now = Instant::now(); let guard = self.cached.load_full(); @@ -243,27 +251,24 @@ impl JwkCacheEntryLock { endpoint: EndpointId, role_name: &RoleName, fetch: &F, - ) -> Result { + ) -> Result { // JWT compact form is defined to be // || . || || . || // where Signature = alg( || . 
|| ); let (header_payload, signature) = jwt .rsplit_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; let (header, payload) = header_payload .split_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) - .context("Provided authentication token is not a valid JWT encoding")?; + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = serde_json::from_slice::>(&header)?; - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; - let kid = header.key_id.context("missing key id")?; + let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; let mut guard = self .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) @@ -281,16 +286,13 @@ impl JwkCacheEntryLock { .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } - _ => { - bail!("jwk not found"); - } + _ => return Err(JwtError::JwkNotFound), } }; - ensure!( - jwk.is_supported(&header.algorithm), - "signature algorithm not supported" - ); + if !jwk.is_supported(&header.algorithm) { + return Err(JwtError::SignatureAlgorithmNotSupported); + } match &jwk.key { jose_jwk::Key::Ec(key) => { @@ -299,34 +301,32 @@ impl JwkCacheEntryLock { jose_jwk::Key::Rsa(key) => { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } - key => bail!("unsupported key type {key:?}"), + key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token 
is not a valid JWT encoding")?; - let payload = serde_json::from_slice::>(&payloadb) - .context("Provided authentication token is not a valid JWT encoding")?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { - ensure!( - payload.audience.0.iter().any(|s| s == aud), - "invalid JWT token audience" - ); + if payload.audience.0.iter().all(|s| s != aud) { + return Err(JwtError::InvalidJwtTokenAudience); + } } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); + if now >= exp + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenHasExpired); + } } if let Some(nbf) = payload.not_before { - ensure!( - nbf < now + CLOCK_SKEW_LEEWAY, - "JWT token is not yet ready to use" - ); + if nbf >= now + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenNotYetReadyToUse); + } } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) @@ -341,7 +341,7 @@ impl JwkCache { role_name: &RoleName, fetch: &F, jwt: &str, - ) -> Result { + ) -> Result { // try with just a read lock first let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); @@ -357,19 +357,18 @@ impl JwkCache { } } -fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { - let pk = - p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } - key => bail!("unsupported ec key type {key:?}"), 
+ key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) @@ -380,14 +379,14 @@ fn verify_rsa_signature( sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, -) -> anyhow::Result<()> { +) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; use rsa::{ pkcs1v15::{Signature, VerifyingKey}, RsaPublicKey, }; - let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { @@ -395,7 +394,7 @@ fn verify_rsa_signature( let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } - _ => bail!("invalid RSA signing algorithm"), + _ => return Err(JwtError::InvalidRsaSigningAlgorithm), }; Ok(()) @@ -561,6 +560,99 @@ impl Drop for JwkRenewalPermit<'_> { } } +#[derive(Error, Debug)] +#[non_exhaustive] +pub(crate) enum JwtError { + #[error("jwk not found")] + JwkNotFound, + + #[error("missing key id")] + MissingKeyId, + + #[error("Provided authentication token is not a valid JWT encoding")] + JwtEncoding(#[from] JwtEncodingError), + + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, + + #[error("invalid P256 key")] + InvalidP256Key(jose_jwk::crypto::Error), + + #[error("invalid RSA key")] + InvalidRsaKey(jose_jwk::crypto::Error), + + #[error("invalid RSA signing algorithm")] + InvalidRsaSigningAlgorithm, + + #[error("unsupported EC key type {0:?}")] + UnsupportedEcKeyType(jose_jwk::EcCurves), + + #[error("unsupported key type {0:?}")] + UnsupportedKeyType(KeyType), + + #[error("signature algorithm not supported")] + SignatureAlgorithmNotSupported, + + #[error("signature error: {0}")] + Signature(#[from] signature::Error), + + #[error("failed to fetch auth rules: {0}")] + FetchAuthRules(#[from] FetchAuthRulesError), +} + +impl From for JwtError { + fn 
from(err: base64::DecodeError) -> Self { + JwtEncodingError::Base64Decode(err).into() + } +} + +impl From for JwtError { + fn from(err: serde_json::Error) -> Self { + JwtEncodingError::SerdeJson(err).into() + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum JwtEncodingError { + #[error(transparent)] + Base64Decode(#[from] base64::DecodeError), + + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + #[error("invalid compact form")] + InvalidCompactForm, +} + +#[allow(dead_code, reason = "Debug use only")] +#[derive(Debug)] +pub(crate) enum KeyType { + Ec(jose_jwk::EcCurves), + Rsa, + Oct, + Okp(jose_jwk::OkpCurves), + Unknown, +} + +impl From<&jose_jwk::Key> for KeyType { + fn from(key: &jose_jwk::Key) -> Self { + match key { + jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), + jose_jwk::Key::Rsa(_rsa) => Self::Rsa, + jose_jwk::Key::Oct(_oct) => Self::Oct, + jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { use crate::RoleName; @@ -758,7 +850,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { Ok(vec![ AuthRule { id: "foo".to_owned(), diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 12451847b1..1dea4d2d73 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,9 +1,9 @@ use std::net::SocketAddr; -use anyhow::Context; use arc_swap::ArcSwapOption; use crate::{ + auth::backend::jwt::FetchAuthRulesError, compute::ConnCfg, context::RequestMonitoring, control_plane::{ @@ -53,11 +53,11 @@ impl FetchAuthRules for StaticAuthRules { &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .context("JWKs settings for this role were not configured")?; + 
.ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 96e1a787ed..7cf158bcd9 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -561,7 +561,8 @@ mod tests { &self, _ctx: &RequestMonitoring, _endpoint: crate::EndpointId, - ) -> anyhow::Result> { + ) -> Result, control_plane::errors::GetEndpointJwksError> + { unimplemented!() } diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index ea2eb79e2a..51cddec672 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -5,7 +5,8 @@ use super::{ AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, + auth::backend::jwt::AuthRule, context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, }; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; @@ -120,7 +121,10 @@ impl Api { }) } - async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + async fn do_get_endpoint_jwks( + &self, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; @@ -224,7 +228,7 @@ impl super::Api for Api { &self, _ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 6cc525a324..0a196fe2a3 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -6,7 +6,7 @@ use 
super::messages::{ControlPlaneError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ - jwt::{AuthRule, FetchAuthRules}, + jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, ComputeCredentialKeys, ComputeUserInfo, }, IpPattern, @@ -44,7 +44,7 @@ pub(crate) mod errors { pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(ControlPlaneError), + ControlPlane(Box), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -90,7 +90,7 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &e { + Reason::Unknown => match &**e { ControlPlaneError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, @@ -246,6 +246,33 @@ pub(crate) mod errors { } } } + + #[derive(Debug, Error)] + pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ApiError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), + } } /// Auth secret which is managed by the cloud. @@ -342,7 +369,7 @@ pub(crate) trait Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result>; + ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( @@ -401,7 +428,7 @@ impl Api for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, errors::GetEndpointJwksError> { match self { Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] @@ -583,7 +610,9 @@ impl FetchAuthRules for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { - self.get_endpoint_jwks(ctx, endpoint).await + ) -> Result, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) } } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index d01878741c..2487ce0e3f 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -9,7 +9,10 @@ use super::{ use crate::{ auth::backend::{jwt::AuthRule, ComputeUserInfo}, compute, - control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + control_plane::{ + errors::GetEndpointJwksError, + messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + }, http, metrics::{CacheOutcome, Metrics}, rate_limiter::WakeComputeRateLimiter, @@ -17,7 +20,6 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use ::http::{header::AUTHORIZATION, HeaderName}; -use anyhow::bail; use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; @@ -137,14 +139,14 @@ impl Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { if !self .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) .await { - bail!("endpoint not found"); + return Err(GetEndpointJwksError::EndpointNotFound); } let request_id = ctx.session_id().to_string(); async { @@ -159,12 +161,17 @@ impl Api { .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) 
.query(&[("session_id", ctx.session_id())]) - .build()?; + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -330,7 +337,7 @@ impl super::Api for Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await } @@ -348,7 +355,7 @@ impl super::Api for Api { let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(*c)) + WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -418,7 +425,7 @@ impl super::Api for Api { self.caches.node_info.insert_ttl( key, - Err(Box::new(err.clone())), + Err(err.clone()), Duration::from_secs(30), ); @@ -457,7 +464,7 @@ async fn parse_body serde::Deserialize<'a>>( body.http_status_code = status; warn!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(body)) + Err(ApiError::ControlPlane(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 58fb36dba7..deb4d4a63f 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -492,30 +492,32 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - 
http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: None, - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + })); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: Some(Status { - code: "error".into(), - message: "error".into(), - details: Details { - error_info: None, - retry_info: Some(control_plane::messages::RetryInfo { - retry_delay_ms: 1, - }), - user_facing_message: None, - }, - }), - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(control_plane::messages::RetryInfo { + retry_delay_ms: 1, + }), + user_facing_message: None, + }, + }), + })); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index ba674f5d0d..0d1527a2c1 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -79,7 +79,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match e { + Reason::Unknown => match **e { ControlPlaneError { http_status_code: StatusCode::LOCKED, ref error, From 614c3aef72ed595190801e8d77fe188e3cb13605 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Oct 2024 
17:18:52 +0300 Subject: [PATCH 13/57] Remove redundant code (#9373) ## Problem There is double update of resize cache in `put_rel_truncation` Also `page_server_request` contains check that fork is MAIN_FORKNUM which 1. is incorrect (because Vm/FSM pages are shreded in the same way as MAIN fork pages and 2. is redundant because `page_server_request` is never called for `get page` request so first part to OR condition is always true. ## Summary of changes Remove redundant code ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/pgdatadir_mapping.rs | 3 --- pgxn/neon/pagestore_smgr.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7aa313f031..900da5beab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1545,9 +1545,6 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. self.pending_nblocks -= old_size as i64 - nblocks as i64; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f46df7f70a..cbb0e2ae6d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1092,8 +1092,7 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. 
* We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || - ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } From cf7a596a151487c1b3afafbe1eb2efab895326ea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 11:18:38 -0500 Subject: [PATCH 14/57] Generate sql_exporter config files with Jsonnet There are quite a few benefits to this approach: - Reduce config duplication - The two sql_exporter configs were super similar with just a few differences - Pull SQL queries into standalone files - That means we could run a SQL formatter on the file in the future - It also means access to syntax highlighting - In the future, run different queries for different PG versions - This is relevant because right now, we have queries that are failing on PG 17 due to catalog updates Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 19 + Dockerfile.build-tools | 1 + Makefile | 1 + compute/.gitignore | 5 + compute/Dockerfile.compute-node | 22 +- compute/Makefile | 35 ++ compute/etc/README.md | 17 + compute/etc/neon_collector.jsonnet | 43 +++ compute/etc/neon_collector.yml | 331 ------------------ .../etc/neon_collector_autoscaling.jsonnet | 11 + compute/etc/neon_collector_autoscaling.yml | 55 --- compute/etc/sql_exporter.jsonnet | 40 +++ compute/etc/sql_exporter.yml | 33 -- .../sql_exporter/checkpoints_req.libsonnet | 10 + compute/etc/sql_exporter/checkpoints_req.sql | 1 + .../sql_exporter/checkpoints_timed.libsonnet | 10 + .../etc/sql_exporter/checkpoints_timed.sql | 1 + .../compute_current_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_current_lsn.sql | 4 + .../compute_logical_snapshot_files.libsonnet | 12 + .../compute_logical_snapshot_files.sql | 7 + .../compute_receive_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_receive_lsn.sql | 4 + .../compute_subscriptions_count.libsonnet | 
12 + .../compute_subscriptions_count.sql | 1 + .../sql_exporter/connection_counts.libsonnet | 13 + .../etc/sql_exporter/connection_counts.sql | 1 + .../etc/sql_exporter/db_total_size.libsonnet | 10 + compute/etc/sql_exporter/db_total_size.sql | 1 + .../getpage_prefetch_discards_total.libsonnet | 9 + .../getpage_prefetch_misses_total.libsonnet | 9 + .../getpage_prefetch_requests_total.libsonnet | 9 + .../getpage_sync_requests_total.libsonnet | 9 + .../getpage_wait_seconds_bucket.libsonnet | 12 + .../getpage_wait_seconds_bucket.sql | 1 + .../getpage_wait_seconds_count.libsonnet | 9 + .../getpage_wait_seconds_sum.libsonnet | 9 + ...lfc_approximate_working_set_size.libsonnet | 12 + .../lfc_approximate_working_set_size.sql | 1 + ...ing_set_size_windows.autoscaling.libsonnet | 12 + ...e_working_set_size_windows.autoscaling.sql | 8 + ...oximate_working_set_size_windows.libsonnet | 12 + ...c_approximate_working_set_size_windows.sql | 8 + .../lfc_cache_size_limit.libsonnet | 10 + .../etc/sql_exporter/lfc_cache_size_limit.sql | 1 + compute/etc/sql_exporter/lfc_hits.libsonnet | 10 + compute/etc/sql_exporter/lfc_hits.sql | 1 + compute/etc/sql_exporter/lfc_misses.libsonnet | 10 + compute/etc/sql_exporter/lfc_misses.sql | 1 + compute/etc/sql_exporter/lfc_used.libsonnet | 10 + compute/etc/sql_exporter/lfc_used.sql | 1 + compute/etc/sql_exporter/lfc_writes.libsonnet | 10 + compute/etc/sql_exporter/lfc_writes.sql | 1 + .../logical_slot_restart_lsn.libsonnet | 15 + .../sql_exporter/logical_slot_restart_lsn.sql | 3 + .../sql_exporter/max_cluster_size.libsonnet | 10 + compute/etc/sql_exporter/max_cluster_size.sql | 1 + .../etc/sql_exporter/neon_perf_counters.sql | 13 + .../pageserver_disconnects_total.libsonnet | 9 + .../pageserver_requests_sent_total.libsonnet | 9 + .../pageserver_send_flushes_total.libsonnet | 9 + .../sql_exporter/pg_stats_userdb.libsonnet | 18 + compute/etc/sql_exporter/pg_stats_userdb.sql | 10 + .../replication_delay_bytes.libsonnet | 10 + 
.../sql_exporter/replication_delay_bytes.sql | 6 + .../replication_delay_seconds.libsonnet | 10 + .../replication_delay_seconds.sql | 5 + .../etc/sql_exporter/retained_wal.libsonnet | 12 + compute/etc/sql_exporter/retained_wal.sql | 5 + .../etc/sql_exporter/wal_is_lost.libsonnet | 12 + compute/etc/sql_exporter/wal_is_lost.sql | 7 + compute/etc/sql_exporter_autoscaling.yml | 33 -- 72 files changed, 635 insertions(+), 457 deletions(-) create mode 100644 compute/.gitignore create mode 100644 compute/Makefile create mode 100644 compute/etc/README.md create mode 100644 compute/etc/neon_collector.jsonnet delete mode 100644 compute/etc/neon_collector.yml create mode 100644 compute/etc/neon_collector_autoscaling.jsonnet delete mode 100644 compute/etc/neon_collector_autoscaling.yml create mode 100644 compute/etc/sql_exporter.jsonnet delete mode 100644 compute/etc/sql_exporter.yml create mode 100644 compute/etc/sql_exporter/checkpoints_req.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_req.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_timed.sql create mode 100644 compute/etc/sql_exporter/compute_current_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_current_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.sql create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.libsonnet create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.sql create mode 100644 compute/etc/sql_exporter/connection_counts.libsonnet create mode 100644 compute/etc/sql_exporter/connection_counts.sql create mode 100644 compute/etc/sql_exporter/db_total_size.libsonnet create mode 100644 
compute/etc/sql_exporter/db_total_size.sql create mode 100644 compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.sql create mode 100644 compute/etc/sql_exporter/lfc_hits.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_hits.sql create mode 100644 compute/etc/sql_exporter/lfc_misses.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_misses.sql create mode 100644 compute/etc/sql_exporter/lfc_used.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_used.sql create mode 100644 compute/etc/sql_exporter/lfc_writes.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_writes.sql create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet create mode 100644 
compute/etc/sql_exporter/logical_slot_restart_lsn.sql create mode 100644 compute/etc/sql_exporter/max_cluster_size.libsonnet create mode 100644 compute/etc/sql_exporter/max_cluster_size.sql create mode 100644 compute/etc/sql_exporter/neon_perf_counters.sql create mode 100644 compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.sql create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.sql create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.sql create mode 100644 compute/etc/sql_exporter/retained_wal.libsonnet create mode 100644 compute/etc/sql_exporter/retained_wal.sql create mode 100644 compute/etc/sql_exporter/wal_is_lost.libsonnet create mode 100644 compute/etc/sql_exporter/wal_is_lost.sql delete mode 100644 compute/etc/sql_exporter_autoscaling.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 51f6975e63..c9a447626f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -120,6 +120,25 @@ jobs: - name: Run mypy to check types run: poetry run mypy . 
+ check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + jsonnetfmt --test \ + $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 54e9134257..7cba1c8635 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -27,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ diff --git a/Makefile b/Makefile index 5e227ed3f5..33cfda2661 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. 
.PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 0000000000..70980d335a --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 412c64eda4..13381b2901 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -349,7 +349,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -1169,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make -C compute ######################################################################################### # @@ -1287,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 0000000000..45fbfa6d5e --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,35 @@ +jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) + +.PHONY: all +all: neon_collector.yml 
neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml + +neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector.jsonnet + +neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector_autoscaling.jsonnet + +sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 0000000000..70b108146c --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file. You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. 
diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 0000000000..2031eb8c85 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,43 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 
'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd..0000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size 
- - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: 
neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: [pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. 
- query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. 
- query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
-- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git 
a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 0000000000..e248172a3d --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba..0000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: 
lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 0000000000..1e3665ac47 --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter for autoscaling-agent + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it. 
+ target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target. + // Glob patterns are supported (see for syntax). + collectors: [ + 'neon_collector_autoscaling', + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see for syntax). + collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468a..0000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. 
- max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 0000000000..8697f8af3b --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: importstr 'sql_exporter/checkpoints_req.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 0000000000..eb8427c883 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 0000000000..9f0b742400 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: importstr 'sql_exporter/checkpoints_timed.sql', +} diff --git 
a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 0000000000..c50853134c --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 0000000000..ccff161358 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 0000000000..be02b8a094 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 0000000000..212f079ccf --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 0000000000..f2454235b7 --- /dev/null +++ 
b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 0000000000..eb68a77ec2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 0000000000..318b31ab41 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 0000000000..e1575da397 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: 
importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 0000000000..50740cb5df --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 0000000000..9f94db67a9 --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 0000000000..6824480fdb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 0000000000..6e08d5fb87 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql new file mode 100644 index 0000000000..9cbbdfd8a3 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; 
diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 100644 index 0000000000..935e35d2e4 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 0000000000..b9a9632105 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 0000000000..75fdb6717b --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 0000000000..f3a1e6b339 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 
+1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..2adda2ad03 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 0000000000..b4a6bc1560 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..d2326974fc --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..844c8419ff --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + 
values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 0000000000..78859ce60d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 0000000000..de509ebb47 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 0000000000..a54deca467 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 0000000000..35fa42c34c --- /dev/null +++ 
b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with the number of +-- minutes. + +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 0000000000..4970bd2c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 0000000000..46c7d1610c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form. 
+ +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 0000000000..4cbbd76621 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 0000000000..378904c1fe --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 0000000000..4a0b7671bf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 0000000000..2e14f5c73c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 0000000000..302998d04f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 
'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 0000000000..27ed4ecf86 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 0000000000..23891dadaf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 0000000000..4f01545f30 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 0000000000..6a22ee1dd9 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 0000000000..37c9abc9cf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet 
b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 0000000000..8ef31b5d8d --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. + +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 0000000000..1b1c038501 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 0000000000..1352fb77ee --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 0000000000..2d2355a9a7 --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 0000000000..58998907a0 --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 
+1,13 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 0000000000..5ad9ba078e --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 0000000000..c191e2467f --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 0000000000..9fa5f77758 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of 
flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 0000000000..46ea2f4192 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 0000000000..00ada87370 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. 
+ +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 0000000000..3e5bb6af1f --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 0000000000..60a6981acd --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. 
+ +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 0000000000..d3f2c21b54 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 0000000000..a76809ad74 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 0000000000..f9eff5faa5 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 0000000000..6c58359461 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git 
a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 0000000000..3cd25f4b39 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 0000000000..5521270851 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233e..0000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. 
Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector_autoscaling.yml" From f1eb7032569c35ec47806c5e736486508d559439 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:35:21 -0400 Subject: [PATCH 15/57] fix(pageserver): use a buffer for basebackup; add aux basebackup metrics log (#9401) Our replication bench project is stuck because it is too slow to generate basebackup and it caused compute to disconnect. https://neondb.slack.com/archives/C03438W3FLZ/p1728330685012419 The compute timeout for waiting for basebackup is 10m (is it true?). Generating basebackup directly on pageserver takes ~3min. Therefore, I suspect it's because there are too many wasted round-trip time for writing the 10000+ snapshot aux files. Also, it is possible that the basebackup process takes too long time retrieving all aux files that it did not write anything over the wire protocol, causing a read timeout. Basebackup size is 800KB gzipped for that project and was 55MB tar before compression. ## Summary of changes * Potentially fix the issue by placing a write buffer for basebackup. 
* Log how many aux files did we read + the time spent on it. Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 21 +++++++++++++++++---- pageserver/src/page_service.rs | 10 +++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3..975318419f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| BasebackupError::Server(e.into()))? - { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::<usize>(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0..afb2f92ff8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await
.map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute startup. For an empty database, we get // <100KB with this method. The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone) From 18f4e5f10cd1eeaa5a5949f9a6130983691311d6 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 15 Oct 2024 23:13:31 +0200 Subject: [PATCH 16/57] Add newly added metrics from neondatabase/neon#9116 to exports (#9402) They weren't added in that PR, but should be available immediately on rollout as the neon extension already defaults to 1.5. --- compute/etc/neon_collector.jsonnet | 8 ++++++++ .../file_cache_read_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_read_wait_seconds_bucket.sql | 1 + .../file_cache_read_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_read_wait_seconds_sum.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_write_wait_seconds_bucket.sql | 1 + .../file_cache_write_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_sum.libsonnet | 9 +++++++++ .../getpage_prefetches_buffered.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 8 +++++++- .../sql_exporter/pageserver_open_requests.libsonnet | 9 +++++++++ 12 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql create mode 100644 
compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_open_requests.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 2031eb8c85..8b43ebe7a3 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -9,9 +9,16 @@ import 'sql_exporter/compute_subscriptions_count.libsonnet', import 'sql_exporter/connection_counts.libsonnet', import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet', import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', @@ -28,6 +35,7 @@ import 'sql_exporter/pageserver_disconnects_total.libsonnet', import 
'sql_exporter/pageserver_requests_sent_total.libsonnet', import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pageserver_open_requests.libsonnet', import 'sql_exporter/pg_stats_userdb.libsonnet', import 'sql_exporter/replication_delay_bytes.libsonnet', import 'sql_exporter/replication_delay_seconds.libsonnet', diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..d13f657a7f --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_read_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC read operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql new file mode 100644 index 0000000000..09047bf0c4 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..aa028b0f5e --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_count', + type: 'counter', + help: 'Number of read operations in LFC', + values: [ + 'file_cache_read_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet 
b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2547aabf3d --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC read operations', + values: [ + 'file_cache_read_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..13dbc77f76 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_write_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC write operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql new file mode 100644 index 0000000000..d03613cf91 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..6227d3193a --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_count', + type: 'counter', + help: 'Number of write operations in LFC', + values: [ + 'file_cache_write_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git 
a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2acfe7f608 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC write operations', + values: [ + 'file_cache_write_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet new file mode 100644 index 0000000000..8926d867c9 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetches_buffered', + type: 'gauge', + help: 'Number of prefetched pages buffered in neon', + values: [ + 'getpage_prefetches_buffered', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 58998907a0..4a36f3bf2f 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -1,13 +1,19 @@ WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + file_cache_read_wait_seconds_count numeric, + file_cache_read_wait_seconds_sum numeric, + file_cache_write_wait_seconds_count numeric, + file_cache_write_wait_seconds_sum numeric, getpage_wait_seconds_count numeric, getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, + getpage_prefetches_buffered numeric, pageserver_requests_sent_total numeric, pageserver_disconnects_total numeric, - 
pageserver_send_flushes_total numeric + pageserver_send_flushes_total numeric, + pageserver_open_requests numeric ); diff --git a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet new file mode 100644 index 0000000000..dca89ea64a --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_open_requests', + type: 'gauge', + help: 'Number of open requests to PageServer', + values: [ + 'pageserver_open_requests', + ], + query_ref: 'neon_perf_counters', +} From be5d6a69dc6a05d339235d00958eb9fea7b0e9f5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 16:30:31 -0500 Subject: [PATCH 17/57] Fix jsonnet_files wildcard Just a typo in a path. Signed-off-by: Tristan Partin --- compute/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/Makefile b/compute/Makefile index 45fbfa6d5e..b407fc60be 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -1,4 +1,6 @@ -jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) +jsonnet_files = $(wildcard \ + etc/*.jsonnet \ + etc/sql_exporter/*.libsonnet) .PHONY: all all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml From 061ea0de7a9768716d941e2e3472f19e075a5ce5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 20:01:13 -0500 Subject: [PATCH 18/57] Add jsonnetfmt targets This should make it a little bit easier for people wanting to check if their files are formatted correctly. Has the added bonus of making the CI check simpler as well.
Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 3 +-- compute/Makefile | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c9a447626f..faee1d89e1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,8 +136,7 @@ jobs: - name: Check Jsonnet code formatting run: | - jsonnetfmt --test \ - $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + make -C compute jsonnetfmt-test # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. diff --git a/compute/Makefile b/compute/Makefile index b407fc60be..f8faa882ee 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -35,3 +35,11 @@ clean: etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ etc/sql_exporter_autoscaling.yml + +.PHONY: jsonnetfmt-test +jsonnetfmt-test: + jsonnetfmt --test $(jsonnet_files) + +.PHONY: jsonnetfmt-format +jsonnetfmt-format: + jsonnetfmt --in-place $(jsonnet_files) From bc6b8cee01cc4055332fef052c048856612bcbab Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:43:48 +0100 Subject: [PATCH 19/57] don't trigger workflows in two repos (#9340) https://github.com/neondatabase/cloud/issues/16723 --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index faee1d89e1..b669eaeb11 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1100,7 +1100,6 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} 
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ From 89a65a9e5a30c7525d165d1a9c2675d05811bfcb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 13:39:58 +0100 Subject: [PATCH 20/57] pageserver: improve handling of archival_config calls during Timeline shutdown (#9415) ## Problem In test `test_timeline_offloading`, we see failures like: ``` PageserverApiException: queue is in state Stopped ``` Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11356917668/index.html#testresult/ff0e348a78a974ee/retries ## Summary of changes - Amend code paths that handle errors from RemoteTimelineClient to check for cancellation and emit the Cancelled error variant in these cases (will give clients a 503 to retry) - Remove the implicit `#[from]` for the Other error case, to make it harder to add code that accidentally squashes errors into this (500-equivalent) error variant. This would be neater if we made RemoteTimelineClient return a structured error instead of anyhow::Error, but that's a bigger refactor. I'm not sure if the test really intends to hit this path, but the error handling fix makes sense either way. 
--- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 44d1bb74ca..20925c7fd6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -632,7 +632,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -1602,7 +1602,8 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); if let Some(timeline) = timelines.get(&timeline_id) { let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); @@ -1672,9 +1673,19 @@ impl Tenant { }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1685,7 +1696,14 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); 
return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) } From f14e45f0cee38bfbbbf1141d486fdd8edfbcc2f2 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 15:01:56 +0200 Subject: [PATCH 21/57] proxy: format imports with nightly rustfmt (#9414) ```shell cargo +nightly fmt -p proxy -- -l --config imports_granularity=Module,group_imports=StdExternalCrate,reorder_imports=true ``` These rust-analyzer settings for VSCode should help retain this style: ```json "rust-analyzer.imports.group.enable": true, "rust-analyzer.imports.prefix": "crate", "rust-analyzer.imports.merge.glob": false, "rust-analyzer.imports.granularity.group": "module", "rust-analyzer.imports.granularity.enforce": true, ``` --- proxy/src/auth/backend/classic.rs | 19 +++-- proxy/src/auth/backend/console_redirect.rs | 21 +++-- proxy/src/auth/backend/hacks.rs | 19 +++-- proxy/src/auth/backend/jwt.rs | 39 +++++---- proxy/src/auth/backend/local.rs | 19 ++--- proxy/src/auth/backend/mod.rs | 61 ++++++-------- proxy/src/auth/credentials.rs | 25 +++--- proxy/src/auth/flow.rs | 25 +++--- proxy/src/auth/mod.rs | 12 +-- proxy/src/bin/local_proxy.rs | 50 ++++++------ proxy/src/bin/pg_sni_router.rs | 16 ++-- proxy/src/bin/proxy.rs | 51 +++++------- proxy/src/cache/endpoints.rs | 34 +++----- proxy/src/cache/project_info.rs | 27 +++---- proxy/src/cache/timed_lru.rs | 13 ++- proxy/src/cancellation.rs | 14 ++-- proxy/src/compute.rs | 30 ++++--- proxy/src/config.rs | 37 ++++----- proxy/src/console_redirect_proxy.rs | 29 +++---- proxy/src/context/mod.rs | 21 ++--- proxy/src/context/parquet.rs | 49 ++++++------ proxy/src/control_plane/messages.rs | 9 ++- proxy/src/control_plane/mgmt.rs | 10 +-- 
proxy/src/control_plane/provider/mock.rs | 39 ++++----- proxy/src/control_plane/provider/mod.rs | 47 +++++------ proxy/src/control_plane/provider/neon.rs | 42 +++++----- proxy/src/error.rs | 3 +- proxy/src/http/health_server.rs | 25 +++--- proxy/src/http/mod.rs | 19 +++-- proxy/src/intern.rs | 14 ++-- proxy/src/jemalloc.rs | 16 ++-- proxy/src/logging.rs | 16 ++-- proxy/src/metrics.rs | 8 +- proxy/src/protocol2.rs | 10 +-- proxy/src/proxy/connect_compute.rs | 29 ++++--- proxy/src/proxy/copy_bidirectional.rs | 9 ++- proxy/src/proxy/handshake.rs | 20 +++-- proxy/src/proxy/mod.rs | 36 ++++----- proxy/src/proxy/passthrough.rs | 14 ++-- proxy/src/proxy/retry.rs | 8 +- proxy/src/proxy/tests/mitm.rs | 3 +- proxy/src/proxy/tests/mod.rs | 22 ++--- proxy/src/proxy/wake_compute.rs | 11 +-- proxy/src/rate_limiter/leaky_bucket.rs | 6 +- proxy/src/rate_limiter/limit_algorithm.rs | 12 +-- .../src/rate_limiter/limit_algorithm/aimd.rs | 3 +- proxy/src/rate_limiter/limiter.rs | 24 +++--- proxy/src/rate_limiter/mod.rs | 4 +- proxy/src/redis/cancellation_publisher.rs | 7 +- .../connection_with_credentials_provider.rs | 9 +-- proxy/src/redis/notifications.rs | 17 ++-- proxy/src/sasl/messages.rs | 3 +- proxy/src/sasl/mod.rs | 5 +- proxy/src/sasl/stream.rs | 7 +- proxy/src/scram/countmin.rs | 4 +- proxy/src/scram/exchange.rs | 3 +- proxy/src/scram/messages.rs | 5 +- proxy/src/scram/mod.rs | 15 ++-- proxy/src/scram/pbkdf2.rs | 10 +-- proxy/src/scram/threadpool.rs | 32 +++----- proxy/src/serverless/backend.rs | 58 ++++++-------- proxy/src/serverless/cancel_set.rs | 8 +- proxy/src/serverless/conn_pool.rs | 44 +++++----- proxy/src/serverless/http_conn_pool.rs | 17 ++-- proxy/src/serverless/http_util.rs | 7 +- proxy/src/serverless/json.rs | 9 +-- proxy/src/serverless/local_conn_pool.rs | 25 +++--- proxy/src/serverless/mod.rs | 19 +++-- proxy/src/serverless/sql_over_http.rs | 80 ++++++------------- proxy/src/serverless/websocket.rs | 41 ++++------ proxy/src/stream.rs | 17 ++-- 
proxy/src/usage_metrics.rs | 41 +++++----- proxy/src/waiters.rs | 8 +- 73 files changed, 726 insertions(+), 835 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 94b84b6f00..de32a06e9e 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo}; -use crate::{ - auth::{self, backend::ComputeCredentialKeys, AuthFlow}, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - sasl, - stream::{PqStream, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::stream::{PqStream, Stream}; +use crate::{compute, sasl}; + pub(super) async fn authenticate( ctx: &RequestMonitoring, creds: ComputeUserInfo, diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 457410ec8c..255e1fed54 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,15 +1,3 @@ -use crate::{ - auth, - cache::Cached, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{self, provider::NodeInfo, CachedNodeInfo}, - error::{ReportableError, UserFacingError}, - proxy::connect_compute::ComputeConnectBackend, - stream::PqStream, - waiters, -}; use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; @@ -18,6 +6,15 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::cache::Cached; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use 
crate::control_plane::provider::NodeInfo; +use crate::control_plane::{self, CachedNodeInfo}; +use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::stream::PqStream; +use crate::{auth, compute, waiters}; #[derive(Debug, Error)] pub(crate) enum WebAuthError { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 749218d260..8ab8d5d37f 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; -use crate::{ - auth::{self, AuthFlow}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - stream::{self, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::stream::{self, Stream}; + /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). 
/// These properties are benefical for serverless JS workers, so we diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 402e59fdb3..3f53ee24c3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,22 +1,22 @@ -use std::{ - future::Future, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{de::Visitor, Deserialize, Deserializer}; +use serde::de::Visitor; +use serde::{Deserialize, Deserializer}; use signature::Verifier; use thiserror::Error; use tokio::time::Instant; -use crate::{ - auth::backend::ComputeCredentialKeys, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, - intern::RoleNameInt, EndpointId, RoleName, -}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::http::parse_json_body_with_limit; +use crate::intern::RoleNameInt; +use crate::{EndpointId, RoleName}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -381,10 +381,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::{ - pkcs1v15::{Signature, VerifyingKey}, - RsaPublicKey, - }; + use rsa::pkcs1v15::{Signature, VerifyingKey}; + use rsa::RsaPublicKey; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; @@ -655,11 +653,9 @@ impl From<&jose_jwk::Key> for KeyType { #[cfg(test)] mod tests { - use crate::RoleName; - - use super::*; - - use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + use std::future::IntoFuture; + use std::net::SocketAddr; + use std::time::SystemTime; use base64::URL_SAFE_NO_PAD; use bytes::Bytes; @@ -672,6 +668,9 @@ mod tests { use signature::Signer; use tokio::net::TcpListener; + use super::*; + use crate::RoleName; + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 1dea4d2d73..e3995ac6c0 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -2,19 +2,14 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; -use crate::{ - auth::backend::jwt::FetchAuthRulesError, - compute::ConnCfg, - context::RequestMonitoring, - control_plane::{ - messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, - NodeInfo, - }, - intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, - EndpointId, -}; - use super::jwt::{AuthRule, FetchAuthRules}; +use crate::auth::backend::jwt::FetchAuthRulesError; +use crate::compute::ConnCfg; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; +use crate::control_plane::NodeInfo; +use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::EndpointId; pub struct LocalBackend { 
pub(crate) node_info: NodeInfo, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7cf158bcd9..a4db130b61 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -17,29 +17,22 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; +use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::AuthSecret; +use crate::control_plane::provider::{ + CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +}; +use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{ - auth::{self, ComputeUserInfoMaybeEndpoint}, - config::AuthenticationConfig, - control_plane::{ - self, - provider::{CachedAllowedIps, CachedNodeInfo}, - Api, - }, - stream, -}; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -500,34 +493,32 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { - use std::{net::IpAddr, sync::Arc, time::Duration}; + use std::net::IpAddr; + use std::sync::Arc; + use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use 
once_cell::sync::Lazy; - use postgres_protocol::{ - authentication::sasl::{ChannelBinding, ScramSha256}, - message::{backend::Message as PgMessage, frontend}, - }; + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use postgres_protocol::message::backend::Message as PgMessage; + use postgres_protocol::message::frontend; use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use crate::{ - auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{ - self, - provider::{self, CachedAllowedIps, CachedRoleSecret}, - CachedNodeInfo, - }, - proxy::NeonOptions, - rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::{threadpool::ThreadPool, ServerSecret}, - stream::{PqStream, Stream}, - }; - - use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; + use super::jwt::JwkCache; + use super::{auth_quirks, AuthRateLimiter}; + use crate::auth::backend::MaskedIp; + use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; + use crate::config::AuthenticationConfig; + use crate::context::RequestMonitoring; + use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; + use crate::control_plane::{self, CachedNodeInfo}; + use crate::proxy::NeonOptions; + use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; + use crate::scram::threadpool::ThreadPool; + use crate::scram::ServerSecret; + use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cba8601d14..fa6bc4c6f5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,20 +1,22 @@ //! User credentials used in authentication. 
-use crate::{ - auth::password_hack::parse_endpoint_param, - context::RequestMonitoring, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, SniKind}, - proxy::NeonOptions, - serverless::SERVERLESS_DRIVER_SNI, - EndpointId, RoleName, -}; +use std::collections::HashSet; +use std::net::IpAddr; +use std::str::FromStr; + use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; +use crate::auth::password_hack::parse_endpoint_param; +use crate::context::RequestMonitoring; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, SniKind}; +use crate::proxy::NeonOptions; +use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::{EndpointId, RoleName}; + #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] @@ -249,10 +251,11 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { - use super::*; use serde_json::json; use ComputeUserInfoParseError::*; + use super::*; + #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9a5139dfb8..ccb17b66b9 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,21 +1,24 @@ //! Main authentication flow. 
-use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; -use crate::{ - config::TlsServerEndPoint, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - scram::{self, threadpool::ThreadPool}, - stream::{PqStream, Stream}, -}; +use std::io; +use std::sync::Arc; + use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::backend::ComputeCredentialKeys; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::config::TlsServerEndPoint; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::scram::threadpool::ThreadPool; +use crate::scram::{self}; +use crate::stream::{PqStream, Stream}; + /// Every authentication selector is supposed to implement this trait. pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0c8686add2..ff97e6c35d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -14,15 +14,15 @@ pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; +use std::io; +use std::net::IpAddr; + pub(crate) use flow::*; +use thiserror::Error; use tokio::time::error::Elapsed; -use crate::{ - control_plane, - error::{ReportableError, UserFacingError}, -}; -use std::{io, net::IpAddr}; -use thiserror::Error; +use crate::control_plane; +use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. 
pub(crate) type Result = std::result::Result; diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c92ebbc51f..e6bc369d9a 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,41 +1,43 @@ -use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; -use proxy::{ - auth::{ - self, - backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, - }, - }, - cancellation::CancellationHandlerMain, - config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - control_plane::{ - locks::ApiLocks, - messages::{EndpointJwksResponse, JwksSettings}, - }, - http::health_server::AppMetrics, - intern::RoleNameInt, - metrics::{Metrics, ThreadPoolMetrics}, - rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, - scram::threadpool::ThreadPool, - serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, - RoleName, +use proxy::auth::backend::jwt::JwkCache; +use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use proxy::auth::{self}; +use proxy::cancellation::CancellationHandlerMain; +use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::control_plane::locks::ApiLocks; +use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use proxy::http::health_server::AppMetrics; +use proxy::intern::RoleNameInt; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, }; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; +use proxy::serverless::{self, 
GlobalConnPoolOptions}; +use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 53f1586abe..00eb830d98 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,25 +5,23 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; use futures::future::Either; +use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use rustls::pki_types::PrivateKeyDer; -use tokio::net::TcpListener; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::TryFutureExt; use proxy::stream::{PqStream, Stream}; - +use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use utils::{project_git_version, sentry_init::init_sentry}; - use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3c0e66dec3..96a71e69c6 100644 --- a/proxy/src/bin/proxy.rs +++ 
b/proxy/src/bin/proxy.rs @@ -1,3 +1,8 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; + +use anyhow::bail; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -7,52 +12,34 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; -use proxy::auth; use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::AuthRateLimiter; -use proxy::auth::backend::ConsoleRedirectBackend; -use proxy::auth::backend::MaybeOwned; -use proxy::cancellation::CancelMap; -use proxy::cancellation::CancellationHandler; -use proxy::config::remote_storage_from_toml; -use proxy::config::AuthenticationConfig; -use proxy::config::CacheOptions; -use proxy::config::HttpConfig; -use proxy::config::ProjectInfoCacheOptions; -use proxy::config::ProxyProtocolV2; +use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; use proxy::context::parquet::ParquetUploadArgs; -use proxy::control_plane; -use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::EndpointRateLimiter; -use proxy::rate_limiter::LeakyBucketConfig; -use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::elasticache; -use 
proxy::redis::notifications; +use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; -use proxy::usage_metrics; - -use anyhow::bail; -use proxy::config::{self, ProxyConfig}; -use proxy::serverless; +use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use tracing::Instrument; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 27121ce89e..82f3247fa7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,31 +1,23 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::convert::Infallible; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; +use redis::streams::{StreamReadOptions, StreamReadReply}; +use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, - rate_limiter::GlobalRateLimiter, - 
redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; +use crate::config::EndpointCacheConfig; +use crate::context::RequestMonitoring; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use crate::rate_limiter::GlobalRateLimiter; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index b92cedb043..31d1dc96e7 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,9 +1,8 @@ -use std::{ - collections::HashSet, - convert::Infallible, - sync::{atomic::AtomicU64, Arc}, - time::Duration, -}; +use std::collections::HashSet; +use std::convert::Infallible; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use dashmap::DashMap; @@ -13,15 +12,12 @@ use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{ - auth::IpPattern, - config::ProjectInfoCacheOptions, - control_plane::AuthSecret, - intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, RoleName, -}; - use super::{Cache, Cached}; +use crate::auth::IpPattern; +use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::AuthSecret; +use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -371,7 +367,8 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{scram::ServerSecret, ProjectId}; + use crate::scram::ServerSecret; + use crate::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 5b08d74696..06eaeb9a30 
100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -1,9 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - time::{Duration, Instant}, -}; -use tracing::debug; +use std::borrow::Borrow; +use std::hash::Hash; +use std::time::{Duration, Instant}; // This seems to make more sense than `lru` or `cached`: // @@ -15,8 +12,10 @@ use tracing::debug; // // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use tracing::debug; -use super::{common::Cached, timed_lru, Cache}; +use super::common::Cached; +use super::{timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 71a2a16af8..db0970adcb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,6 +1,8 @@ +use std::net::SocketAddr; +use std::sync::Arc; + use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -8,12 +10,10 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::info; use uuid::Uuid; -use crate::{ - error::ReportableError, - metrics::{CancellationRequest, CancellationSource, Metrics}, - redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, - }, +use crate::error::ReportableError; +use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; pub type CancelMap = Arc>>; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 006804fcd4..212e82497f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,25 +1,31 @@ -use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - context::RequestMonitoring, - 
control_plane::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, NumDbConnectionsGuard}, - proxy::neon_option, - Host, -}; +use std::io; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; -use std::{io, net::SocketAddr, sync::Arc, time::Duration}; +use rustls::client::danger::ServerCertVerifier; +use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; +use crate::auth::parse_endpoint_param; +use crate::cancellation::CancelClosure; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::ApiLockError; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::proxy::neon_option; +use crate::Host; + pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index c068fc50fb..2ec8c7adda 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,29 +1,27 @@ -use crate::{ - auth::backend::{jwt::JwkCache, AuthRateLimiter}, - control_plane::locks::ApiLocks, - rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, - scram::threadpool::ThreadPool, - serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, - Host, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + use anyhow::{bail, ensure, Context, Ok}; use 
clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::{ - crypto::ring::sign, - pki_types::{CertificateDer, PrivateKeyDer}, -}; +use rustls::crypto::ring::sign; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tracing::{error, info}; use x509_parser::oid_registry; +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::AuthRateLimiter; +use crate::control_plane::locks::ApiLocks; +use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::Host; + pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, @@ -692,9 +690,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { - use crate::rate_limiter::Aimd; - use super::*; + use crate::rate_limiter::Aimd; #[test] fn test_parse_cache_options() -> anyhow::Result<()> { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 9e17976720..81d1d70958 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -1,25 +1,22 @@ -use crate::auth::backend::ConsoleRedirectBackend; -use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, -}; -use crate::{ - cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, -}; -use futures::TryFutureExt; use std::sync::Arc; + +use futures::TryFutureExt; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use 
tokio_util::sync::CancellationToken; use tracing::{error, info, Instrument}; +use crate::auth::backend::ConsoleRedirectBackend; +use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, }; pub async fn task_main( diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7fb4e7c698..e2d2c1b766 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -1,24 +1,25 @@ //! 
Connection request monitoring contexts +use std::net::IpAddr; + use chrono::Utc; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use smol_str::SmolStr; -use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{debug, field::display, info, info_span, Span}; +use tracing::field::display; +use tracing::{debug, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; -use crate::{ - control_plane::messages::{ColdStartInfo, MetricsAuxInfo}, - error::ErrorKind, - intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, - DbName, EndpointId, RoleName, -}; - use self::parquet::RequestData; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::error::ErrorKind; +use crate::intern::{BranchIdInt, ProjectIdInt}; +use crate::metrics::{ + ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, +}; +use crate::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9f6f83022e..b0ad0e4566 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,29 +1,28 @@ -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use bytes::{buf::Writer, BufMut, BytesMut}; +use bytes::buf::Writer; +use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; -use parquet::{ - basic::Compression, - file::{ - metadata::RowGroupMetaDataPtr, - properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, - writer::SerializedFileWriter, - }, - record::RecordWriter, -}; +use parquet::basic::Compression; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::writer::SerializedFileWriter; +use parquet::record::RecordWriter; use 
pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; -use tokio::{sync::mpsc, time}; +use tokio::sync::mpsc; +use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; - use super::{RequestMonitoringInner, LOG_CHAN}; +use crate::config::remote_storage_from_toml; +use crate::context::LOG_CHAN_DISCONNECT; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -407,26 +406,26 @@ async fn upload_parquet( #[cfg(test)] mod tests { - use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + use std::net::Ipv4Addr; + use std::num::NonZeroUsize; + use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; - use parquet::{ - basic::{Compression, ZstdLevel}, - file::{ - properties::{WriterProperties, DEFAULT_PAGE_SIZE}, - reader::FileReader, - serialized_reader::SerializedFileReader, - }, - }; - use rand::{rngs::StdRng, Rng, SeedableRng}; + use parquet::basic::{Compression, ZstdLevel}; + use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use remote_storage::{ GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; - use tokio::{sync::mpsc, time}; + use tokio::sync::mpsc; + use tokio::time; use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 960bb5bc21..dae23f7c53 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -1,9 
+1,9 @@ -use measured::FixedCardinalityLabel; -use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; -use crate::auth::IpPattern; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; @@ -362,9 +362,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + fn dummy_aux() -> serde_json::Value { json!({ "endpoint_id": "endpoint", diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2c4b5a9b94..5ac3acd28a 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -1,16 +1,16 @@ -use crate::{ - control_plane::messages::{DatabaseInfo, KickSession}, - waiters::{self, Waiter, Waiters}, -}; +use std::convert::Infallible; + use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; +use crate::control_plane::messages::{DatabaseInfo, KickSession}; +use crate::waiters::{self, Waiter, Waiters}; + static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index 51cddec672..fb061376e7 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -1,28 +1,29 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use super::{ - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, -}; -use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, -}; -use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{auth::IpPattern, cache::Cached}; -use crate::{ - control_plane::{ - messages::MetricsAuxInfo, - provider::{CachedAllowedIps, CachedRoleSecret}, - }, - BranchId, EndpointId, ProjectId, -}; +use std::str::FromStr; +use std::sync::Arc; + use futures::TryFutureExt; -use std::{str::FromStr, sync::Arc}; use thiserror::Error; -use tokio_postgres::{config::SslMode, Client}; +use tokio_postgres::config::SslMode; +use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::IpPattern; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::error::io_error; +use crate::intern::RoleNameInt; +use crate::url::ApiUrl; +use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; + #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 0a196fe2a3..a4a330cd5f 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -2,39 +2,36 @@ pub mod mock; pub mod neon; -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::{ - auth::{ - backend::{ - 
jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, - ComputeCredentialKeys, ComputeUserInfo, - }, - IpPattern, - }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, - compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, - context::RequestMonitoring, - error::ReportableError, - intern::ProjectIdInt, - metrics::ApiLockMetrics, - rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, EndpointId, -}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + use dashmap::DashMap; -use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; +use super::messages::{ControlPlaneError, MetricsAuxInfo}; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::intern::ProjectIdInt; +use crate::metrics::ApiLockMetrics; +use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::{compute, scram, EndpointCacheKey, EndpointId}; + pub(crate) mod errors { - use crate::{ - control_plane::messages::{self, ControlPlaneError, Reason}, - error::{io_error, ErrorKind, ReportableError, UserFacingError}, - proxy::retry::CouldRetry, - }; use thiserror::Error; use super::ApiLockError; + use crate::control_plane::messages::{self, ControlPlaneError, Reason}; + use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; + use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. 
pub(crate) const REQUEST_FAILED: &str = "Console request failed"; diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index 2487ce0e3f..5d0692c7ca 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -1,31 +1,31 @@ //! Production console backend. -use super::{ - super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}, - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, - NodeInfo, -}; -use crate::{ - auth::backend::{jwt::AuthRule, ComputeUserInfo}, - compute, - control_plane::{ - errors::GetEndpointJwksError, - messages::{ColdStartInfo, EndpointJwksResponse, Reason}, - }, - http, - metrics::{CacheOutcome, Metrics}, - rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, EndpointId, -}; -use crate::{cache::Cached, context::RequestMonitoring}; -use ::http::{header::AUTHORIZATION, HeaderName}; +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; use futures::TryFutureExt; -use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; +use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{ + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, + NodeInfo, +}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use 
crate::{compute, http, scram, EndpointCacheKey, EndpointId}; + const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 1cd4dc2c22..e71ed0c048 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,5 @@ -use std::{error::Error as StdError, fmt, io}; +use std::error::Error as StdError; +use std::{fmt, io}; use measured::FixedCardinalityLabel; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index d0352351d5..978ad9f761 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,19 +1,18 @@ +use std::convert::Infallible; +use std::net::TcpListener; +use std::sync::{Arc, Mutex}; + use anyhow::{anyhow, bail}; -use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; -use measured::{text::BufferedTextEncoder, MetricGroup}; +use hyper0::header::CONTENT_TYPE; +use hyper0::{Body, Request, Response, StatusCode}; +use measured::text::BufferedTextEncoder; +use measured::MetricGroup; use metrics::NeonMetrics; -use std::{ - convert::Infallible, - net::TcpListener, - sync::{Arc, Mutex}, -}; use tracing::{info, info_span}; -use utils::http::{ - endpoint::{self, request_span}, - error::ApiError, - json::json_response, - RouterBuilder, RouterService, -}; +use utils::http::endpoint::{self, request_span}; +use utils::http::error::ApiError; +use utils::http::json::json_response; +use utils::http::{RouterBuilder, RouterService}; use crate::jemalloc; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index d8676d5b50..fd587e8f01 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -10,17 +10,15 @@ use anyhow::bail; use bytes::Bytes; use http_body_util::BodyExt; use hyper::body::Body; +pub(crate) use reqwest::{Request, Response}; +use reqwest_middleware::RequestBuilder; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use 
reqwest_retry::policies::ExponentialBackoff; +pub(crate) use reqwest_retry::RetryTransientMiddleware; use serde::de::DeserializeOwned; -pub(crate) use reqwest::{Request, Response}; -pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; - -use crate::{ - metrics::{ConsoleRequest, Metrics}, - url::ApiUrl, -}; -use reqwest_middleware::RequestBuilder; +use crate::metrics::{ConsoleRequest, Metrics}; +use crate::url::ApiUrl; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). @@ -142,9 +140,10 @@ pub(crate) async fn parse_json_body_with_limit( #[cfg(test)] mod tests { - use super::*; use reqwest::Client; + use super::*; + #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 108420d7d7..09fd9657d0 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -1,6 +1,8 @@ -use std::{ - hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, -}; +use std::hash::BuildHasherDefault; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::Index; +use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; @@ -208,9 +210,8 @@ impl From for ProjectIdInt { mod tests { use std::sync::OnceLock; - use crate::intern::StringInterner; - use super::InternId; + use crate::intern::StringInterner; struct MyId; impl InternId for MyId { @@ -222,7 +223,8 @@ mod tests { #[test] fn push_many_strings() { - use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index d307d80f4a..0fae78b60c 100644 --- a/proxy/src/jemalloc.rs +++ 
b/proxy/src/jemalloc.rs @@ -1,14 +1,12 @@ use std::marker::PhantomData; -use measured::{ - label::NoLabels, - metric::{ - gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, - MetricFamilyEncoding, MetricType, - }, - text::TextEncoder, - LabelGroup, MetricGroup, -}; +use measured::label::NoLabels; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; +use measured::text::TextEncoder; +use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a34eb820f8..11921867e4 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,14 +1,10 @@ use tracing::Subscriber; -use tracing_subscriber::{ - filter::{EnvFilter, LevelFilter}, - fmt::{ - format::{Format, Full}, - time::SystemTime, - FormatEvent, FormatFields, - }, - prelude::*, - registry::LookupSpan, -}; +use tracing_subscriber::filter::{EnvFilter, LevelFilter}; +use tracing_subscriber::fmt::format::{Format, Full}; +use tracing_subscriber::fmt::time::SystemTime; +use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::registry::LookupSpan; /// Initialize logging and OpenTelemetry tracing and exporter. 
/// diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 272723a1bc..542826e833 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,16 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; +use measured::label::{ + FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet, +}; +use measured::metric::histogram::Thresholds; +use measured::metric::name::MetricName; use measured::{ - label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, - metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; - use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 17764f78d1..ef2391cdd8 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,11 +1,9 @@ //! 
Proxy Protocol V2 implementation -use std::{ - io, - net::SocketAddr, - pin::Pin, - task::{Context, Poll}, -}; +use std::io; +use std::net::SocketAddr; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::BytesMut; use pin_project_lite::pin_project; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aac7720890..8e9663626a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,24 +1,23 @@ -use crate::{ - auth::backend::ComputeCredentialKeys, - compute::COULD_NOT_CONNECT, - compute::{self, PostgresConnection}, - config::RetryConfig, - context::RequestMonitoring, - control_plane::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, - error::ReportableError, - metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, - proxy::{ - retry::{retry_after, should_retry, CouldRetry}, - wake_compute::wake_compute, - }, - Host, -}; use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; +use crate::auth::backend::ComputeCredentialKeys; +use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::config::RetryConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ReportableError; +use crate::metrics::{ + ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, +}; +use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::wake_compute::wake_compute; +use crate::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4ebda013ac..91a3ceff75 100644 --- 
a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,11 +1,11 @@ -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use tracing::info; - use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{ready, Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + #[derive(Debug)] enum TransferState { Running(CopyBuffer), @@ -256,9 +256,10 @@ impl CopyBuffer { #[cfg(test)] mod tests { - use super::*; use tokio::io::AsyncWriteExt; + use super::*; + #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 5996b11c11..a67f1b8112 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,21 +1,19 @@ use bytes::Buf; +use pq_proto::framed::Framed; use pq_proto::{ - framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, - StartupMessageParams, + BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; -use crate::{ - auth::endpoint_sni, - config::{TlsConfig, PG_ALPN_PROTOCOL}, - context::RequestMonitoring, - error::ReportableError, - metrics::Metrics, - proxy::ERR_INSECURE_CONNECTION, - stream::{PqStream, Stream, StreamUpgradeError}, -}; +use crate::auth::endpoint_sni; +use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::Metrics; +use crate::proxy::ERR_INSECURE_CONNECTION; +use crate::stream::{PqStream, Stream, StreamUpgradeError}; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index b2b5a7f43d..f646862caa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -7,40 +7,32 @@ 
pub(crate) mod handshake; pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; -pub use copy_bidirectional::copy_bidirectional_client_compute; -pub use copy_bidirectional::ErrorSource; +use std::sync::Arc; -use crate::config::ProxyProtocolV2; -use crate::{ - auth, - cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, - compute, - config::{ProxyConfig, TlsConfig}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, - stream::{PqStream, Stream}, - EndpointCacheKey, -}; +pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; -use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use self::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, -}; +use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::passthrough::ProxyPassthrough; +use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::{PqStream, Stream}; +use crate::{auth, compute, EndpointCacheKey}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; diff 
--git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 497cf4bfd5..e3b4730982 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,16 +1,14 @@ -use crate::{ - cancellation, - compute::PostgresConnection, - control_plane::messages::MetricsAuxInfo, - metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, - stream::Stream, - usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; +use crate::cancellation; +use crate::compute::PostgresConnection; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::stream::Stream; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 15895d37e6..d3f0c3e7d4 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,7 +1,11 @@ -use crate::{compute, config::RetryConfig}; -use std::{error::Error, io}; +use std::error::Error; +use std::io; + use tokio::time; +use crate::compute; +use crate::config::RetryConfig; + pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 33a2162bc7..df9f79a7e3 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; -use super::*; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; @@ -14,6 +13,8 @@ use tokio::io::{AsyncReadExt, DuplexStream}; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; +use 
super::*; + enum Intercept { None, Methods, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index deb4d4a63f..e50ae4bc93 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -4,6 +4,16 @@ mod mitm; use std::time::Duration; +use anyhow::{bail, Context}; +use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; +use rstest::rstest; +use rustls::pki_types; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::{MakeTlsConnect, NoTls}; +use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; + use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; @@ -18,15 +28,6 @@ use crate::control_plane::provider::{ use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; -use anyhow::{bail, Context}; -use async_trait::async_trait; -use http::StatusCode; -use retry::{retry_after, ShouldRetryWakeCompute}; -use rstest::rstest; -use rustls::pki_types; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. 
fn generate_certs( @@ -336,7 +337,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::{distributions::Alphanumeric, Rng}; + use rand::distributions::Alphanumeric; + use rand::Rng; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 0d1527a2c1..9dfa485fa4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,16 +1,17 @@ +use hyper::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::{ControlPlaneError, Reason}; -use crate::control_plane::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::control_plane::provider::CachedNodeInfo; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper::StatusCode; -use tracing::{error, info, warn}; - -use super::connect_compute::ComputeConnectBackend; pub(crate) async fn wake_compute( num_retries: &mut u32, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bf4d85f2e4..45f9630dde 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -1,7 +1,5 @@ -use std::{ - hash::Hash, - sync::atomic::{AtomicUsize, Ordering}, -}; +use std::hash::Hash; +use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use dashmap::DashMap; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 
25607b7e10..16c398f303 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,10 +1,12 @@ //! Algorithms for controlling concurrency limits. +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + use parking_lot::Mutex; -use std::{pin::pin, sync::Arc, time::Duration}; -use tokio::{ - sync::Notify, - time::{error::Elapsed, Instant}, -}; +use tokio::sync::Notify; +use tokio::time::error::Elapsed; +use tokio::time::Instant; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 86b56e38fb..5332a5184f 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -60,12 +60,11 @@ impl LimitAlgorithm for Aimd { mod tests { use std::time::Duration; + use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; - use super::*; - #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index be529f174d..5de64c2254 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,14 @@ -use std::{ - borrow::Cow, - collections::hash_map::RandomState, - hash::{BuildHasher, Hash}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Mutex, - }, -}; +use std::borrow::Cow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; @@ -243,14 +240,17 @@ impl BucketRateLimiter { #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, time::Duration}; + use 
std::hash::BuildHasherDefault; + use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; - use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; + use crate::intern::EndpointIdInt; + use crate::rate_limiter::RateBucketInfo; + use crate::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 6e38f89458..3ae2ecaf8f 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -2,13 +2,11 @@ mod leaky_bucket; mod limit_algorithm; mod limiter; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; - pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub(crate) use limiter::GlobalRateLimiter; - -pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 95bdfc0965..0000246971 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,13 +5,10 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; -use super::{ - connection_with_credentials_provider::ConnectionWithCredentialsProvider, - notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, -}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( diff --git a/proxy/src/redis/connection_with_credentials_provider.rs 
b/proxy/src/redis/connection_with_credentials_provider.rs index ccd48f1481..82139ea1d5 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,10 +1,9 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; +use std::time::Duration; use futures::FutureExt; -use redis::{ - aio::{ConnectionLike, MultiplexedConnection}, - ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, -}; +use redis::aio::{ConnectionLike, MultiplexedConnection}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; use tracing::{debug, error, info, warn}; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index c3af6740cb..e56c5a3414 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,4 +1,5 @@ -use std::{convert::Infallible, sync::Arc}; +use std::convert::Infallible; +use std::sync::Arc; use futures::StreamExt; use pq_proto::CancelKeyData; @@ -8,12 +9,10 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::{ - cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler}, - intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, -}; +use crate::cache::project_info::ProjectInfoCache; +use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -269,10 +268,10 @@ where #[cfg(test)] mod tests { - use crate::{ProjectId, RoleName}; + use serde_json::json; use super::*; - use serde_json::json; + use crate::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> 
anyhow::Result<()> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 6c9a42b2db..1373dfba3d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,8 +1,9 @@ //! Definitions for SASL messages. -use crate::parse::{split_at_const, split_cstr}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use crate::parse::{split_at_const, split_cstr}; + /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub(crate) struct FirstMessage<'a> { diff --git a/proxy/src/sasl/mod.rs b/proxy/src/sasl/mod.rs index 0a36694359..f0181b404f 100644 --- a/proxy/src/sasl/mod.rs +++ b/proxy/src/sasl/mod.rs @@ -10,13 +10,14 @@ mod channel_binding; mod messages; mod stream; -use crate::error::{ReportableError, UserFacingError}; use std::io; -use thiserror::Error; pub(crate) use channel_binding::ChannelBinding; pub(crate) use messages::FirstMessage; pub(crate) use stream::{Outcome, SaslStream}; +use thiserror::Error; + +use crate::error::{ReportableError, UserFacingError}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index b6becd28e1..f1c916daa2 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -1,11 +1,14 @@ //! Abstraction for the string-oriented SASL protocols. -use super::{messages::ServerMessage, Mechanism}; -use crate::stream::PqStream; use std::io; + use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::messages::ServerMessage; +use super::Mechanism; +use crate::stream::PqStream; + /// Abstracts away all peculiarities of the libpq's protocol. pub(crate) struct SaslStream<'a, S> { /// The underlying stream. 
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 64ee0135e1..87ab6e0d5f 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -69,7 +69,9 @@ impl CountMinSketch { #[cfg(test)] mod tests { - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use super::CountMinSketch; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index afb5604666..493295c938 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -209,7 +209,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step, ExchangeState}; + use sasl::Step; + use ExchangeState; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index fd9e77764c..5ee3a51352 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -1,11 +1,12 @@ //! Definitions for SCRAM messages. +use std::fmt; +use std::ops::Range; + use super::base64_decode_array; use super::key::{ScramKey, SCRAM_KEY_LEN}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; -use std::fmt; -use std::ops::Range; /// Faithfully taken from PostgreSQL. 
pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index d058f1c3f8..97644b6282 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -16,10 +16,9 @@ mod signature; pub mod threadpool; pub(crate) use exchange::{exchange, Exchange}; +use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; - -use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; @@ -59,13 +58,11 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::{ - intern::EndpointIdInt, - sasl::{Mechanism, Step}, - EndpointId, - }; - - use super::{threadpool::ThreadPool, Exchange, ServerSecret}; + use super::threadpool::ThreadPool; + use super::{Exchange, ServerSecret}; + use crate::intern::EndpointIdInt; + use crate::sasl::{Mechanism, Step}; + use crate::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 4cf76c8452..9c559e9082 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -1,7 +1,6 @@ -use hmac::{ - digest::{consts::U32, generic_array::GenericArray}, - Hmac, Mac, -}; +use hmac::digest::consts::U32; +use hmac::digest::generic_array::GenericArray; +use hmac::{Hmac, Mac}; use sha2::Sha256; pub(crate) struct Pbkdf2 { @@ -66,10 +65,11 @@ impl Pbkdf2 { #[cfg(test)] mod tests { - use super::Pbkdf2; use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; + use super::Pbkdf2; + #[test] fn works() { let salt = b"sodium chloride"; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index c027a0cd20..cc1b69fcf9 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,28 +4,21 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. 
-use std::{ - cell::RefCell, - future::Future, - pin::Pin, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Weak, - }, - task::{Context, Poll}, -}; +use std::cell::RefCell; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::task::{Context, Poll}; use futures::FutureExt; -use rand::Rng; -use rand::{rngs::SmallRng, SeedableRng}; - -use crate::{ - intern::EndpointIdInt, - metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, - scram::countmin::CountMinSketch, -}; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use super::pbkdf2::Pbkdf2; +use crate::intern::EndpointIdInt; +use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; +use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, @@ -195,9 +188,8 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { - use crate::EndpointId; - use super::*; + use crate::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 927854897f..a180c4c2ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,42 +1,34 @@ -use std::{io, sync::Arc, time::Duration}; +use std::io; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use p256::ecdsa::SigningKey; +use p256::elliptic_curve::JwkEcKey; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tracing::{debug, field::display, info}; +use tracing::field::display; +use tracing::{debug, info}; -use crate::{ - auth::{ - self, - backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, - check_peer_addr_is_in_list, AuthError, - }, - compute, - config::ProxyConfig, - context::RequestMonitoring, - control_plane::{ - errors::{GetAuthInfoError, WakeComputeError}, - locks::ApiLocks, - 
provider::ApiLockError, - CachedNodeInfo, - }, - error::{ErrorKind, ReportableError, UserFacingError}, - intern::EndpointIdInt, - proxy::{ - connect_compute::ConnectMechanism, - retry::{CouldRetry, ShouldRetryWakeCompute}, - }, - rate_limiter::EndpointRateLimiter, - EndpointId, Host, -}; - -use super::{ - conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, - http_conn_pool::{self, poll_http2_client}, - local_conn_pool::{self, LocalClient, LocalConnPool}, -}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use crate::auth::backend::local::StaticAuthRules; +use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::provider::ApiLockError; +use crate::control_plane::CachedNodeInfo; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::intern::EndpointIdInt; +use crate::proxy::connect_compute::ConnectMechanism; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc, diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 7659745473..6db986f1f7 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -1,10 +1,8 @@ //! 
A set for cancelling random http connections -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - num::NonZeroUsize, - time::Duration, -}; +use std::hash::{BuildHasher, BuildHasherDefault}; +use std::num::NonZeroUsize; +use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2e576e0ded..aa869ff1c0 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,33 +1,31 @@ +use std::collections::HashMap; +use std::fmt; +use std::ops::Deref; +use std::pin::pin; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + use dashmap::DashMap; -use futures::{future::poll_fn, Future}; +use futures::future::poll_fn; +use futures::Future; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; -use std::{ - fmt, - task::{ready, Poll}, -}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct 
ConnInfoWithAuth { @@ -724,13 +722,13 @@ impl Drop for Client { #[cfg(test)] mod tests { - use std::{mem, sync::atomic::AtomicBool}; - - use crate::{ - proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, - }; + use std::mem; + use std::sync::atomic::AtomicBool; use super::*; + use crate::proxy::NeonOptions; + use crate::serverless::cancel_set::CancelSet; + use crate::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 6d61536f1a..9b6bc98557 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,22 +1,21 @@ +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; + use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::collections::VecDeque; -use std::sync::atomic::{self, AtomicUsize}; -use std::{sync::Arc, sync::Weak}; use tokio::net::TcpStream; +use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool::ConnInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, EndpointCacheKey}; - -use tracing::{debug, error}; -use tracing::{info, info_span, Instrument}; - -use super::conn_pool::ConnInfo; +use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c1c5764d17..c0208d4f68 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -1,12 +1,11 @@ //! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility //! 
Will merge back in at some point in the future. -use bytes::Bytes; - use anyhow::Context; +use bytes::Bytes; use http::{Response, StatusCode}; -use http_body_util::{combinators::BoxBody, BodyExt, Full}; - +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 9f328a0e1d..8c56d317cc 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,7 +1,5 @@ -use serde_json::Map; -use serde_json::Value; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use serde_json::{Map, Value}; +use tokio_postgres::types::{Kind, Type}; use tokio_postgres::Row; // @@ -256,9 +254,10 @@ fn _pg_array_parse( #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 4ab14ad35f..5df37a8762 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,28 +1,31 @@ -use futures::{future::poll_fn, Future}; +use std::collections::HashMap; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + +use futures::future::poll_fn; +use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use serde_json::value::RawValue; use signature::Signer; -use std::task::{ready, Poll}; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; - -use 
crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::Metrics; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, DbName, RoleName}; - -use tracing::{error, warn, Span}; -use tracing::{info, info_span, Instrument}; +use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; use super::conn_pool::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::Metrics; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{DbName, RoleName}; struct ConnPoolEntry { conn: ClientInner, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3131adada4..3ed3b6c845 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -12,12 +12,15 @@ mod local_conn_pool; mod sql_over_http; mod websocket; +use std::net::{IpAddr, SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; + +use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; - -use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -29,9 +32,13 @@ use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; +use tracing::{info, warn, Instrument}; +use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; @@ -43,14 +50,6 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, 
json_response}; -use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use tokio::net::{TcpListener, TcpStream}; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; - pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index cf3324926c..3d8a2adef1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,77 +2,43 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::select; -use futures::future::try_join; -use futures::future::Either; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::{select, try_join, Either}; +use futures::{StreamExt, TryFutureExt}; use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; -use http_body_util::BodyExt; -use http_body_util::Full; -use hyper::body::Body; -use hyper::body::Incoming; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{HeaderMap, Request}; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Incoming}; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time; -use tokio_postgres::error::DbError; -use tokio_postgres::error::ErrorPosition; -use tokio_postgres::error::SqlState; -use tokio_postgres::GenericClient; -use tokio_postgres::IsolationLevel; -use tokio_postgres::NoTls; -use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Transaction; +use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; +use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use 
tokio_util::sync::CancellationToken; -use tracing::error; -use tracing::info; +use tracing::{error, info}; use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; -use crate::auth::backend::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; -use crate::auth::endpoint_sni; -use crate::auth::ComputeUserInfoParseError; -use crate::config::AuthenticationConfig; -use crate::config::HttpConfig; -use crate::config::ProxyConfig; -use crate::config::TlsConfig; -use crate::context::RequestMonitoring; -use crate::error::ErrorKind; -use crate::error::ReportableError; -use crate::error::UserFacingError; -use crate::metrics::HttpDirection; -use crate::metrics::Metrics; -use crate::proxy::run_until_cancelled; -use crate::proxy::NeonOptions; -use crate::serverless::backend::HttpConnError; -use crate::usage_metrics::MetricCounter; -use crate::usage_metrics::MetricCounterRecorder; -use crate::DbName; -use crate::RoleName; - -use super::backend::LocalProxyConnError; -use super::backend::PoolingBackend; -use super::conn_pool; -use super::conn_pool::AuthData; -use super::conn_pool::ConnInfo; -use super::conn_pool::ConnInfoWithAuth; +use super::backend::{LocalProxyConnError, PoolingBackend}; +use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; use super::http_util::json_response; -use super::json::json_to_pg_text; -use super::json::pg_text_row_to_json; -use super::json::JsonConversionError; -use super::local_conn_pool; +use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::{conn_pool, local_conn_pool}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::{HttpDirection, Metrics}; +use 
crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index f5a692cf40..ba36116c2c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,13 +1,7 @@ -use crate::proxy::ErrorSource; -use crate::{ - cancellation::CancellationHandlerMain, - config::ProxyConfig, - context::RequestMonitoring, - error::{io_error, ReportableError}, - metrics::Metrics, - proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, -}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; @@ -15,15 +9,17 @@ use futures::{Sink, Stream}; use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; - -use std::{ - pin::Pin, - sync::Arc, - task::{ready, Context, Poll}, -}; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::error::{io_error, ReportableError}; +use crate::metrics::Metrics; +use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::rate_limiter::EndpointRateLimiter; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
@@ -184,14 +180,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::{ - io::{duplex, AsyncReadExt, AsyncWriteExt}, - task::JoinSet, - }; - use tokio_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; + use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::task::JoinSet; + use tokio_tungstenite::tungstenite::protocol::Role; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::WebSocketStream; use super::WebSocketRw; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e2fc73235e..89df48c5d3 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,19 +1,20 @@ -use crate::config::TlsServerEndPoint; -use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::Metrics; -use bytes::BytesMut; - -use pq_proto::framed::{ConnectionError, Framed}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; -use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; + +use bytes::BytesMut; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; +use rustls::ServerConfig; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::config::TlsServerEndPoint; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; + /// Stream wrapper which implements libpq's protocol. /// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index ee36ed462d..c5384c0b0e 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,36 +1,33 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{ - config::{MetricBackupCollectionConfig, MetricCollectionConfig}, - context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, - intern::{BranchIdInt, EndpointIdInt}, -}; +use std::convert::Infallible; +use std::pin::pin; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use std::{ - convert::Infallible, - pin::pin, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; +use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; +use crate::http; +use crate::intern::{BranchIdInt, EndpointIdInt}; + const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -485,19 +482,23 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use super::*; + use std::sync::{Arc, Mutex}; - use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; - use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper::body::Incoming; + use hyper::server::conn::http1; + use 
hyper::service::service_fn; + use hyper::{Request, Response}; use hyper_util::rt::TokioIo; - use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use url::Url; + use super::*; + use crate::{http, BranchId, EndpointId}; + #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 86d0f9e8b2..7e07f6a2af 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,8 +1,9 @@ +use std::pin::Pin; +use std::task; + use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; -use std::pin::Pin; -use std::task; use thiserror::Error; use tokio::sync::oneshot; @@ -99,9 +100,10 @@ impl std::future::Future for Waiter<'_, T> { #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); From d490ad23e0948b7c49098638ffc669774c61049e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Oct 2024 14:04:17 +0100 Subject: [PATCH 22/57] storcon: use the same trace fields for reconciler and results (#9410) ## Problem The reconciler use `seq`, but processing of results uses `sequence`. Order is different too. It makes it annoying to read logs. ## Summary of Changes Use the same tracing fields in both --- storage_controller/src/service.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cedee54534..25e1fb5e1f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1074,8 +1074,9 @@ impl Service { /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] /// will indicate that reconciliation is not needed. 
#[instrument(skip_all, fields( - tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), - sequence=%result.sequence + seq=%result.sequence, + tenant_id=%result.tenant_shard_id.tenant_id, + shard_id=%result.tenant_shard_id.shard_slug(), ))] fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); From d6281cbe65db6959e83c6d8abb44c0a3184e8b97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 15:27:46 +0100 Subject: [PATCH 23/57] tests: stabilize test_timelines_parallel_endpoints (#9413) ## Problem This test would get failures like `command failed: Found no timeline id for branch name 'branch_8'` It's because neon_local is being invoked concurrently for branch creation, which is unsafe (they'll step on each others' JSON writes) Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9410/11363051979/index.html#testresult/5ddc56c640f5422b/retries ## Summary of changes - Don't do branch creation concurrently with endpoint creation via neon_local --- test_runner/regress/test_tenants.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4a16535941..03cb79fc1d 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,6 +19,7 @@ from fixtures.metrics import ( parse_metrics, ) from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn, @@ -490,8 +491,8 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): n_threads = 16 barrier = threading.Barrier(n_threads) - def test_timeline(branch_name: str, timeline_id: TimelineId): - endpoint = env.endpoints.create_start(branch_name) + def test_timeline(branch_name: str, timeline_id: TimelineId, endpoint: Endpoint): + endpoint.start() endpoint.stop() # Use a barrier to make sure we restart endpoints at the same time 
barrier.wait() @@ -502,8 +503,12 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): for i in range(0, n_threads): branch_name = f"branch_{i}" timeline_id = env.create_branch(branch_name) - w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + endpoint = env.endpoints.create(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id, endpoint]) workers.append(w) + + # Only start the restarts once we're done creating all timelines & endpoints + for w in workers: w.start() for w in workers: From 3140c14d608e79d792518d9d9144460b6ff01b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:28:55 +0200 Subject: [PATCH 24/57] Remove allow(clippy::unknown_lints) (#9416) the lint stabilized in 1.80. --- pageserver/src/tenant/timeline.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f098d0e82..1992dee930 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3092,7 +3092,6 @@ impl Timeline { } impl Timeline { - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// From 9668601f4666bd82cee653800433ce66a4d9fb21 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 16 Oct 2024 15:29:23 +0100 Subject: [PATCH 25/57] Add support of extensions for v17 (part 2) (#9389) - plv8 3.2.3 - HypoPG 1.4.1 - pgtap 1.3.3 - timescaledb 2.17.0 - pg_hint_plan 17_1_7_0 - rdkit Release_2024_09_1 - pg_uuidv7 1.6.0 - wal2json 2.6 - pg_ivm 1.9 - pg_partman 5.1.0 update support of extensions for v14-v16: - HypoPG 1.4.0 -> 1.4.1 - pgtap 1.2.0 -> 1.3.3 - plpgsql_check 2.5.3 -> 2.7.11 - pg_uuidv7 1.0.1 -> 1.6.0 - wal2json 2.5 -> 2.6 - pg_ivm 1.7 -> 1.9 - pg_partman 5.0.1 -> 5.1.0 --- compute/Dockerfile.compute-node | 182 
++++++++++++++++++++------------ 1 file changed, 114 insertions(+), 68 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 13381b2901..f05039f8b7 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -18,13 +18,14 @@ RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. + # libstdc++-10-dev is required for plv8 bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ - VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): bookworm) \ - VERSION_INSTALLS="cmake"; \ + VERSION_INSTALLS="cmake libstdc++-12-dev"; \ ;; \ *) \ echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ @@ -227,18 +228,33 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ +# plv8 3.2.3 supports v17 +# last release v3.2.3 - Sep 7, 2024 +# +# clone the repo instead of downloading the release tarball because plv8 has submodule dependencies +# and the release tarball doesn't include them +# +# Use new version only for v17 +# because since v3.2, plv8 doesn't include plcoffee and plls extensions +ENV PLV8_TAG=v3.2.3 + +RUN case "${PG_VERSION}" in \ + "v17") \ + export PLV8_TAG=v3.2.3 \ + ;; \ + "v14" | "v15" | "v16") \ + export PLV8_TAG=v3.1.10 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ - echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ + git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ + tar -czf plv8.tar.gz --exclude .git plv8-src && \ + cd plv8-src && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -248,8 +264,17 @@ RUN case "${PG_VERSION}" in "v17") \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - ln -s plv8-3.1.10.so plv8-3.1.5.so && \ - ln -s plv8-3.1.10.so plv8-3.1.8.so && \ + case "${PG_VERSION}" in \ + "v17") \ + ln -s plv8-3.2.3.so plv8-3.1.8.so && \ + ln -s plv8-3.2.3.so plv8-3.1.5.so && \ + ln -s plv8-3.2.3.so plv8-3.1.10.so \ + ;; \ + "v14" | "v15" | "v16") \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so \ + ;; \ + esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> 
/usr/local/pgsql/share/extension/plls.control @@ -327,6 +352,9 @@ COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. +# +# v17 is not supported yet because of upstream issue +# https://github.com/pgvector/pgvector/issues/669 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -366,11 +394,10 @@ FROM build-deps AS hypopg-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +# HypoPG 1.4.1 supports v17 +# last release 1.4.1 - Apr 28, 2024 +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ + echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -407,6 +434,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch +# maybe version-specific +# support for v17 is unknown +# last release 1.3.13 - Sep 19, 2022 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ esac && \ @@ -428,11 +458,10 @@ FROM build-deps AS pgtap-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ - echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ +# pgtap 1.3.3 supports v17 +# last release v1.3.3 - Apr 8, 2024 +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ + echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -505,11 +534,10 @@ FROM build-deps AS plpgsql-check-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ - echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ +# plpgsql_check v2.7.11 supports v17 +# last release v2.7.11 - Sep 16, 2024 +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ + echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -527,18 +555,19 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ - *) \ + "v16") \ export TIMESCALEDB_VERSION=2.13.0 \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ + "v17") \ + export TIMESCALEDB_VERSION=2.17.0 \ + export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ @@ -561,10 +590,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +# version-specific, has separate releases for each version +RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -578,7 +605,8 @@ RUN case "${PG_VERSION}" in "v17") \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ "v17") \ - echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + export PG_HINT_PLAN_VERSION=17_1_7_0 \ + export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ @@ -602,6 +630,10 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# 1.6.4 available, supports v17 +# This is an experimental extension that we do not support on prod yet. +# !Do not remove! +# We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -623,23 +655,37 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev + libeigen3-dev \ + libboost-all-dev +# rdkit Release_2024_09_1 supports v17 +# last release Release_2024_09_1 - Sep 27, 2024 +# +# Use new version only for v17 +# because Release_2024_09_1 has some backward incompatible changes +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v17") \ + export RDKIT_VERSION=Release_2024_09_1 \ + export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ + ;; \ + "v14" | "v15" | "v16") \ + export RDKIT_VERSION=Release_2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -678,12 +724,11 @@ FROM build-deps AS pg-uuidv7-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# not version-specific +# last release v1.6.0 - Oct 9, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ - echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -754,6 +799,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is our extension, support stopped in favor of pgvector +# TODO: deprecate it ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ @@ -780,6 +827,8 @@ FROM build-deps AS pg-anon-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ @@ -946,13 +995,12 @@ FROM build-deps AS wal2json-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# wal2json wal2json_2_6 supports v17 +# last release wal2json_2_6 - Apr 25, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ - echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ + echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -966,12 +1014,11 @@ FROM build-deps AS pg-ivm-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# pg_ivm v1.9 supports v17 +# last release v1.9 - Jul 31 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ - echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ + echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -987,12 +1034,11 @@ FROM build-deps AS pg-partman-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# should support v17 https://github.com/pgpartman/pg_partman/discussions/693 +# last release 5.1.0 Apr 2, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_partman doesn't support PG17 yet" && exit 0;; \ - esac && \ - wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ - echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ + echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From 55b246085ea30341f2479ecfadff374a5487e74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:47:17 +0200 Subject: [PATCH 26/57] Activate timelines during unoffload (#9399) The current code has forgotten to activate timelines during unoffload, leading to inability to receive the basebackup, due to the timeline still being in loading state. ``` stderr: command failed: compute startup failed: failed to get basebackup@0/0 from pageserver postgresql://no_user@localhost:15014 Caused by: 0: db error: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading 1: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading ``` Therefore, also activate the timeline during unoffloading. Part of #8088 --- pageserver/src/http/routes.rs | 7 +++- pageserver/src/tenant.rs | 40 +++++++++++++------- test_runner/regress/test_timeline_archive.py | 17 +++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dd403c1cef..36a6ed427b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -720,7 +720,12 @@ async fn timeline_archival_config_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 20925c7fd6..689982ddd4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1554,6 +1554,7 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: 
storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); @@ -1605,25 +1606,37 @@ impl Tenant { }) .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); + } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1664,12 +1677,13 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a non-offloaded one - 
self.unoffload_timeline(timeline_id, ctx).await? + self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 @@ -3354,7 +3368,7 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 971cc57a1c..ffaed5e130 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -136,6 +136,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -197,4 +208,10 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) assert leaf_detail["is_archived"] is False + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + assert not 
timeline_offloaded(initial_timeline_id) From 8a114e3aeda7a2e321fa4524335c1748448cae07 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:19:45 -0400 Subject: [PATCH 27/57] refactor(pageserver): upgrade remote_storage to use hyper1 (#9405) part of https://github.com/neondatabase/neon/issues/9255 ## Summary of changes Upgrade remote_storage crate to use hyper1. Hyper0 is used when providing the streaming HTTP body to the s3 SDK, and it is refactored to use hyper1. Signed-off-by: Alex Chi Z --- Cargo.lock | 3 ++- libs/remote_storage/Cargo.toml | 3 ++- libs/remote_storage/src/s3_bucket.rs | 8 +++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e772814ec..6b212bac2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,9 +4648,10 @@ dependencies = [ "camino-tempfile", "futures", "futures-util", + "http-body-util", "http-types", "humantime-serde", - "hyper 0.14.30", + "hyper 1.4.1", "itertools 0.10.5", "metrics", "once_cell", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index be4d61f009..1816825bda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper0 = { workspace = true, features = ["stream"] } +hyper = { workspace = true, features = ["client"] } futures.workspace = true serde.workspace = true serde_json.workspace = true @@ -36,6 +36,7 @@ azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true +http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index f950f2886c..cde32df402 100644 --- a/libs/remote_storage/src/s3_bucket.rs 
+++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,13 +28,15 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_body_util::StreamBody; use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper0::Body; +use futures_util::StreamExt; +use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -710,8 +712,8 @@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(from); - let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); + let body = StreamBody::new(from.map(|x| x.map(Frame::data))); + let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client From ed694732e707b15592991902c89f5078935ec177 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 19:10:49 +0200 Subject: [PATCH 28/57] proxy: merge AuthError and AuthErrorImpl (#9418) Since GetAuthInfoError now boxes the ControlPlaneError message the variant is not big anymore and AuthError is 32 bytes. 
--- proxy/src/auth/flow.rs | 10 +++--- proxy/src/auth/mod.rs | 78 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index ccb17b66b9..6294549ff6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -9,7 +9,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use super::backend::ComputeCredentialKeys; -use super::{AuthErrorImpl, PasswordHackPayload}; +use super::{AuthError, PasswordHackPayload}; use crate::config::TlsServerEndPoint; use crate::context::RequestMonitoring; use crate::control_plane::AuthSecret; @@ -117,14 +117,14 @@ impl AuthFlow<'_, S, PasswordHack> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingEndpointName)?; + .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } @@ -136,7 +136,7 @@ impl AuthFlow<'_, S, CleartextPassword> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, @@ -166,7 +166,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) - .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + .ok_or(AuthError::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index ff97e6c35d..7a373dd825 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -29,7 +29,7 @@ pub(crate) type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] -pub(crate) enum AuthErrorImpl { +pub(crate) enum AuthError { #[error(transparent)] Web(#[from] backend::WebAuthError), @@ -78,80 +78,70 @@ pub(crate) enum AuthErrorImpl { ConfirmationTimeout(humantime::Duration), } -#[derive(Debug, Error)] -#[error(transparent)] -pub(crate) struct AuthError(Box); - impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { - AuthErrorImpl::BadAuthMethod(name.into()).into() + AuthError::BadAuthMethod(name.into()) } pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthErrorImpl::AuthFailed(user.into()).into() + AuthError::AuthFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { - AuthErrorImpl::IpAddressNotAllowed(ip).into() + AuthError::IpAddressNotAllowed(ip) } pub(crate) fn too_many_connections() -> Self { - AuthErrorImpl::TooManyConnections.into() + AuthError::TooManyConnections } pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + matches!(self, AuthError::AuthFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { - AuthErrorImpl::UserTimeout(elapsed).into() + AuthError::UserTimeout(elapsed) } pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { - AuthErrorImpl::ConfirmationTimeout(timeout).into() - } -} - -impl> From for AuthError { - fn from(e: E) -> Self { - Self(Box::new(e.into())) + 
AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.to_string_client(), - AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), - AuthErrorImpl::Sasl(e) => e.to_string_client(), - AuthErrorImpl::AuthFailed(_) => self.to_string(), - AuthErrorImpl::BadAuthMethod(_) => self.to_string(), - AuthErrorImpl::MalformedPassword(_) => self.to_string(), - AuthErrorImpl::MissingEndpointName => self.to_string(), - AuthErrorImpl::Io(_) => "Internal error".to_string(), - AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), - AuthErrorImpl::TooManyConnections => self.to_string(), - AuthErrorImpl::UserTimeout(_) => self.to_string(), - AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), + match self { + Self::Web(e) => e.to_string_client(), + Self::GetAuthInfo(e) => e.to_string_client(), + Self::Sasl(e) => e.to_string_client(), + Self::AuthFailed(_) => self.to_string(), + Self::BadAuthMethod(_) => self.to_string(), + Self::MalformedPassword(_) => self.to_string(), + Self::MissingEndpointName => self.to_string(), + Self::Io(_) => "Internal error".to_string(), + Self::IpAddressNotAllowed(_) => self.to_string(), + Self::TooManyConnections => self.to_string(), + Self::UserTimeout(_) => self.to_string(), + Self::ConfirmationTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.get_error_kind(), - AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), - AuthErrorImpl::Sasl(e) => e.get_error_kind(), - AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, - AuthErrorImpl::Io(_) => 
crate::error::ErrorKind::ClientDisconnect, - AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, - AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, - AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + match self { + Self::Web(e) => e.get_error_kind(), + Self::GetAuthInfo(e) => e.get_error_kind(), + Self::Sasl(e) => e.get_error_kind(), + Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::BadAuthMethod(_) => crate::error::ErrorKind::User, + Self::MalformedPassword(_) => crate::error::ErrorKind::User, + Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::UserTimeout(_) => crate::error::ErrorKind::User, + Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } From 0551cfb6a74258537255af18428b0345f24f2702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 20:04:56 +0200 Subject: [PATCH 29/57] Fix beta clippy warnings (#9419) ``` warning: first doc comment paragraph is too long --> compute_tools/src/installed_extensions.rs:35:1 | 35 | / /// Connect to every database (see list_dbs above) and get the list of installed extensions. 36 | | /// Same extension can be installed in multiple databases with different versions, 37 | | /// we only keep the highest and lowest version across all databases. | |_ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph = note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default help: add an empty line | 35 ~ /// Connect to every database (see list_dbs above) and get the list of installed extensions. 
36 + /// | ``` --- compute_tools/src/installed_extensions.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 3d8b22a8a3..72578b1f34 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -33,6 +33,7 @@ fn list_dbs(client: &mut Client) -> Result> { } /// Connect to every database (see list_dbs above) and get the list of installed extensions. +/// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. pub async fn get_installed_extensions(connstr: Url) -> Result { From 409a286eaa6f030494c8914fcaa36dcc7d6496d1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 13:08:40 -0500 Subject: [PATCH 30/57] Fix typo in sql_exporter generator Bad copy-paste seemingly. This manifested itself as a failure to start for the sql_exporter, and was just dying on loop in staging. A future PR will have E2E testing of sql_exporter. Signed-off-by: Tristan Partin --- compute/etc/sql_exporter.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 1e3665ac47..640e2ac38d 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). collectors: [ - 'neon_collector_autoscaling', + 'neon_collector', ], }, From e0fa6bcf1a9a33929cfcfd0cefada739a8fe6fea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 14:46:33 -0500 Subject: [PATCH 31/57] Fix some sql_exporter metrics for PG 17 Checkpointer related statistics moved from pg_stat_bgwriter to pg_stat_checkpointer, so we need to adjust our queries accordingly. 
Signed-off-by: Tristan Partin --- compute/Dockerfile.compute-node | 3 ++- compute/Makefile | 6 ++++-- compute/etc/sql_exporter/checkpoints_req.17.sql | 1 + .../etc/sql_exporter/checkpoints_req.libsonnet | 7 ++++++- .../etc/sql_exporter/checkpoints_timed.17.sql | 1 + .../etc/sql_exporter/checkpoints_timed.libsonnet | 7 ++++++- compute/jsonnet/neon.libsonnet | 16 ++++++++++++++++ 7 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 compute/etc/sql_exporter/checkpoints_req.17.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.17.sql create mode 100644 compute/jsonnet/neon.libsonnet diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index f05039f8b7..b0ce7c1718 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -1221,12 +1221,13 @@ RUN rm /usr/local/pgsql/lib/lib*.a # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +ARG PG_VERSION USER nonroot COPY --chown=nonroot compute compute -RUN make -C compute +RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # diff --git a/compute/Makefile b/compute/Makefile index f8faa882ee..e4f08a223c 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -6,13 +6,15 @@ jsonnet_files = $(wildcard \ all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml neon_collector.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector.jsonnet neon_collector_autoscaling.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector_autoscaling.jsonnet sql_exporter.yml: $(jsonnet_files) diff --git 
a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql new file mode 100644 index 0000000000..a4b946e8e2 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -0,0 +1 @@ +SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet index 8697f8af3b..e5d9753507 100644 --- a/compute/etc/sql_exporter/checkpoints_req.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + { metric_name: 'checkpoints_req', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_req', ], - query: importstr 'sql_exporter/checkpoints_req.sql', + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/etc/sql_exporter/checkpoints_timed.17.sql b/compute/etc/sql_exporter/checkpoints_timed.17.sql new file mode 100644 index 0000000000..0d86ddb3ea --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.17.sql @@ -0,0 +1 @@ +SELECT num_timed AS checkpoints_timed FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet index 9f0b742400..0ba0080188 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql'; + { metric_name: 'checkpoints_timed', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_timed', ], - query: importstr 'sql_exporter/checkpoints_timed.sql', + query: if
neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/jsonnet/neon.libsonnet b/compute/jsonnet/neon.libsonnet new file mode 100644 index 0000000000..583b631c58 --- /dev/null +++ b/compute/jsonnet/neon.libsonnet @@ -0,0 +1,16 @@ +local MIN_SUPPORTED_VERSION = 14; +local MAX_SUPPORTED_VERSION = 17; +local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); + +# If we receive the pg_version with a leading "v", ditch it. +local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); +local pg_version_num = std.parseInt(pg_version); + +assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : + std.format('%s is an unsupported Postgres version: %s', + [pg_version, std.toString(SUPPORTED_VERSIONS)]); + +{ + PG_MAJORVERSION: pg_version, + PG_MAJORVERSION_NUM: pg_version_num, +} From 67d5d98b1960c7f7b88d1f9860cd9672411cb815 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 16 Oct 2024 21:47:53 +0200 Subject: [PATCH 32/57] readme: fix build instructions for debian 12 (#9371) We need libprotobuf-dev for some of the `/usr/include/google/protobuf/...*.proto` referenced by our protobuf decls. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfc63b4708..e68ef70bdf 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev +libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash From 934dbb61f557477512b3cf5c98e9930e5745d87e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Oct 2024 08:04:57 +0300 Subject: [PATCH 33/57] Check access_count in lfc_evict (#9407) ## Problem See https://neondb.slack.com/archives/C033A2WE6BZ/p1729007738526309?thread_ts=1722942856.987979&cid=C033A2WE6BZ When a replica receives a WAL record whose target page is not present in the shared buffer, we evict this page from the LFC. If all pages from the LFC chunk are evicted, the chunk is moved to the beginning of the LRU list to force its reuse. Unfortunately, access_count is not checked, and if the entry is accessed at this moment, this operation can cause LRU list corruption. ## Summary of changes Check `access_count` in `lfc_evict` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 45 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index bbea5a8b0d..70b250d394 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -617,31 +617,34 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) /* remove the page from the cache */ entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) + if (entry->access_count == 0) { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk + * If the chunk has no live entries, we can position the chunk to be + * recycled first. 
*/ - if (!has_remaining_pages) + if (entry->bitmap[chunk_offs >> 5] == 0) { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); + bool has_remaining_pages = false; + + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) + { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); + } } } From db68e822355a4ef8ac9e3363d90bb9a2bd0e6dad Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Oct 2024 10:06:02 +0100 Subject: [PATCH 34/57] storage_scrubber: fixes to garbage commands (#9409) ## Problem While running `find-garbage` and `purge-garbage`, I encountered two things that needed updating: - Console API may omit `user_id` since org accounts were added - When we cut over to using GenericRemoteStorage, the object listings we do during purge did not get proper retry handling, so could easily fail on usual S3 errors, and make the whole process drop out. ...and one bug: - We had a `.unwrap` which expects that after finding an object in a tenant path, a listing in that path will always return objects. This is not true, because a pageserver might be deleting the path at the same time as we scan it. ## Summary of changes - When listing objects during purge, use backoff::retry - Make `user_id` an `Option` - Handle the case where a tenant's objects go away during find-garbage. 
--- storage_scrubber/src/cloud_admin_api.rs | 2 +- storage_scrubber/src/garbage.rs | 65 ++++++++++++++++--------- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index 70b108cf23..7b82a0b116 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -138,7 +138,7 @@ pub struct ProjectData { pub name: String, pub region_id: String, pub platform_id: String, - pub user_id: String, + pub user_id: Option, pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index d53611ed6e..a0040ada08 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -250,13 +250,16 @@ async fn find_garbage_inner( &target.tenant_root(&tenant_shard_id), ) .await?; - let object = tenant_objects.keys.first().unwrap(); - if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); - garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); - continue; + if let Some(object) = tenant_objects.keys.first() { + if 
object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -406,14 +409,17 @@ pub async fn get_tenant_objects( // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. - let list = s3_client - .list( - Some(&tenant_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "get_tenant_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; Ok(list.keys) } @@ -424,14 +430,25 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - let list = s3_client - .list( - Some(&timeline_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || { + s3_client.list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &cancel, + ) + }, + |_| false, + 3, + MAX_RETRIES as u32, + "get_timeline_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; + 
Ok(list.keys) } From 22d8834474d1f619b6ed351fd80033b4a064bb21 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 17 Oct 2024 13:38:24 +0300 Subject: [PATCH 35/57] proxy: move the connection pools to separate file (#9398) First PR for #9284 Start unification of the client and connection pool interfaces: - Exclude the 'global_connections_count' out from the get_conn_entry() - Move remote connection pools to the conn_pool_lib as a reference - Unify clients among all the conn pools --- proxy/src/serverless/backend.rs | 13 +- proxy/src/serverless/conn_pool.rs | 593 ++---------------------- proxy/src/serverless/conn_pool_lib.rs | 562 ++++++++++++++++++++++ proxy/src/serverless/http_conn_pool.rs | 50 +- proxy/src/serverless/local_conn_pool.rs | 111 ++--- proxy/src/serverless/mod.rs | 5 +- proxy/src/serverless/sql_over_http.rs | 15 +- 7 files changed, 709 insertions(+), 640 deletions(-) create mode 100644 proxy/src/serverless/conn_pool_lib.rs diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index a180c4c2ed..82e81dbcfe 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -11,8 +11,9 @@ use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; use tracing::{debug, info}; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client}; +use super::conn_pool::poll_client; +use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, Send}; use super::local_conn_pool::{self, LocalClient, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; @@ -31,7 +32,7 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc, + pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) 
config: &'static ProxyConfig, @@ -199,7 +200,7 @@ impl PoolingBackend { &self, ctx: &RequestMonitoring, conn_info: ConnInfo, - ) -> Result { + ) -> Result, HttpConnError> { info!("pool: looking for an existing connection"); if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); @@ -481,7 +482,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc, + pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -491,7 +492,7 @@ struct HyperMechanism { #[async_trait] impl ConnectMechanism for HyperMechanism { - type Connection = http_conn_pool::Client; + type Connection = http_conn_pool::Client; type ConnectError = HttpConnError; type Error = HttpConnError; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index aa869ff1c0..b97c656510 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,31 +1,29 @@ -use std::collections::HashMap; use std::fmt; -use std::ops::Deref; use std::pin::pin; -use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::task::{ready, Poll}; -use std::time::Duration; -use dashmap::DashMap; use futures::future::poll_fn; use futures::Future; -use parking_lot::RwLock; -use rand::Rng; use smallvec::SmallVec; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use tracing::{error, info, info_span, warn, Instrument}; -use super::backend::HttpConnError; -use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{DbName, EndpointCacheKey, RoleName}; 
+use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; + +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; + +#[cfg(test)] +use { + super::conn_pool_lib::GlobalConnPoolOptions, + crate::auth::backend::ComputeUserInfo, + std::{sync::atomic, time::Duration}, +}; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -33,34 +31,12 @@ pub(crate) struct ConnInfoWithAuth { pub(crate) auth: AuthData, } -#[derive(Debug, Clone)] -pub(crate) struct ConnInfo { - pub(crate) user_info: ComputeUserInfo, - pub(crate) dbname: DbName, -} - #[derive(Debug, Clone)] pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } -impl ConnInfo { - // hm, change to hasher to avoid cloning? - pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { - (self.dbname.clone(), self.user_info.user.clone()) - } - - pub(crate) fn endpoint_cache_key(&self) -> Option { - // We don't want to cache http connections for ephemeral endpoints. - if self.user_info.options.is_ephemeral() { - None - } else { - Some(self.user_info.endpoint_cache_key()) - } - } -} - impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -75,402 +51,6 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, - _last_access: std::time::Instant, -} - -// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool -// Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, - total_conns: usize, - max_conns: usize, - _guard: HttpEndpointPoolsGuard<'static>, - global_connections_count: Arc, - global_pool_size_max_conns: usize, -} - -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { - let Self { - pools, - total_conns, - global_connections_count, - .. 
- } = self; - pools.get_mut(&db_user).and_then(|pool_entries| { - pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) - }) - } - - fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { - let Self { - pools, - total_conns, - global_connections_count, - .. - } = self; - if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.conn_id != conn_id); - let new_len = pool.conns.len(); - let removed = old_len - new_len; - if removed > 0 { - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - } - *total_conns -= removed; - removed > 0 - } else { - false - } - } - - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { - let conn_id = client.conn_id; - - if client.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return; - } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); - return; - } - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < pool.max_conns { - let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - pool.global_connections_count - .fetch_add(1, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .inc(); - } - - pool.total_conns - }; - - // do logging outside of the 
mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - } -} - -impl Drop for EndpointConnPool { - fn drop(&mut self) { - if self.total_conns > 0 { - self.global_connections_count - .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(self.total_conns as i64); - } - } -} - -pub(crate) struct DbUserConnPool { - conns: Vec>, -} - -impl Default for DbUserConnPool { - fn default() -> Self { - Self { conns: Vec::new() } - } -} - -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { - let old_len = self.conns.len(); - - self.conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = self.conns.len(); - let removed = old_len - new_len; - *conns -= removed; - removed - } - - fn get_conn_entry( - &mut self, - conns: &mut usize, - global_connections_count: Arc, - ) -> Option> { - let mut removed = self.clear_closed_clients(conns); - let conn = self.conns.pop(); - if conn.is_some() { - *conns -= 1; - removed += 1; - } - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - conn - } -} - -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. 
- global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -#[derive(Debug, Clone, Copy)] -pub struct GlobalConnPoolOptions { - // Maximum number of connections per one endpoint. - // Can mix different (dbname, username) connections. - // When running out of free slots for a particular endpoint, - // falls back to opening a new connection for each request. - pub max_conns_per_endpoint: usize, - - pub gc_epoch: Duration, - - pub pool_shards: usize, - - pub idle_timeout: Duration, - - pub opt_in: bool, - - // Total number of connections in the pool. - pub max_total_conns: usize, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - #[cfg(test)] - pub(crate) fn get_global_connections_count(&self) -> usize { - self.global_connections_count - .load(atomic::Ordering::Relaxed) - } - - pub(crate) fn get_idle_timeout(&self) -> Duration { - self.config.pool_options.idle_timeout - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - 
.start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); - - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. - if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. 
size now {global_pool_size}"); - } - } - - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(client) = client { - if client.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - - fn get_or_create_endpoint_pool( - self: &Arc, - endpoint: &EndpointCacheKey, - ) -> Arc>> { - // fast path - if let Some(pool) = self.global_pool.get(endpoint) { - return pool.clone(); - } - - // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { - pools: HashMap::new(), - total_conns: 0, - max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: Metrics::get().proxy.http_endpoint_pools.guard(), - global_connections_count: self.global_connections_count.clone(), - global_pool_size_max_conns: self.config.pool_options.max_total_conns, - })); - - // find or create a pool for this endpoint - let mut created = false; - let pool = self - .global_pool - .entry(endpoint.clone()) - 
.or_insert_with(|| { - created = true; - new_pool - }) - .clone(); - - // log new global pool size - if created { - let global_pool_size = self - .global_pool_size - .fetch_add(1, atomic::Ordering::Relaxed) - + 1; - info!( - "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" - ); - } - - pool - } -} - pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, @@ -574,7 +154,7 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInner { + let inner = ClientInnerRemote { inner: client, session: tx, cancel, @@ -584,7 +164,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInnerRemote { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -592,131 +172,36 @@ struct ClientInner { conn_id: uuid::Uuid, } -impl Drop for ClientInner { - fn drop(&mut self) { - // on client drop, tell the conn to shut down - self.cancel.cancel(); +impl ClientInnerRemote { + pub(crate) fn inner_mut(&mut self) -> &mut C { + &mut self.inner } -} -pub(crate) trait ClientInnerExt: Sync + Send + 'static { - fn is_closed(&self) -> bool; - fn get_process_id(&self) -> i32; -} - -impl ClientInnerExt for tokio_postgres::Client { - fn is_closed(&self) -> bool { - self.is_closed() + pub(crate) fn inner(&self) -> &C { + &self.inner + } + + pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender { + &mut self.session + } + + pub(crate) fn aux(&self) -> &MetricsAuxInfo { + &self.aux + } + + pub(crate) fn get_conn_id(&self) -> uuid::Uuid { + self.conn_id } - fn get_process_id(&self) -> i32 { - self.get_process_id() - } -} -impl ClientInner { pub(crate) fn is_closed(&self) -> bool { self.inner.is_closed() } } -impl Client { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) - } -} - -pub(crate) struct Client { - 
span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>>, -} - -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>>, -} - -impl Client { - pub(self) fn new( - inner: ClientInner, - conn_info: ConnInfo, - pool: Weak>>, - ) -> Self { - Self { - inner: Some(inner), - span: Span::current(), - conn_info, - pool, - } - } - pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { conn_info, pool }) - } -} - -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle"); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - -impl Deref for Client { - type Target = C; - - fn deref(&self) -> &Self::Target { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } -} - -impl Client { - fn do_drop(&mut self) -> Option { - let conn_info = self.conn_info.clone(); - let client = self - .inner - .take() - .expect("client inner should not be removed"); - if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); - // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { +impl Drop for ClientInnerRemote { fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - 
tokio::task::spawn_blocking(drop); - } + // on client drop, tell the conn to shut down + self.cancel.cancel(); } } @@ -745,12 +230,12 @@ mod tests { } } - fn create_inner() -> ClientInner { + fn create_inner() -> ClientInnerRemote { create_inner_with(MockClient::new(false)) } - fn create_inner_with(client: MockClient) -> ClientInner { - ClientInner { + fn create_inner_with(client: MockClient) -> ClientInnerRemote { + ClientInnerRemote { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), cancel: CancellationToken::new(), @@ -797,7 +282,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.inner().1.discard(); + client.inner_mut().1.discard(); // Discard should not add the connection from the pool. assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs new file mode 100644 index 0000000000..6e964ce878 --- /dev/null +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -0,0 +1,562 @@ +use dashmap::DashMap; +use parking_lot::RwLock; +use rand::Rng; +use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration}; +use std::{ + ops::Deref, + sync::atomic::{self, AtomicUsize}, +}; +use tokio_postgres::ReadyForQueryStatus; + +use crate::control_plane::messages::ColdStartInfo; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{ + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, +}; + +use super::conn_pool::ClientInnerRemote; +use tracing::info; +use tracing::{debug, Span}; + +use super::backend::HttpConnError; + +#[derive(Debug, Clone)] +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? 
+ pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub(crate) fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } + } +} + +pub(crate) struct ConnPoolEntry { + pub(crate) conn: ClientInnerRemote, + pub(crate) _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub(crate) struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, + total_conns: usize, + max_conns: usize, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + pools.get_mut(&db_user).and_then(|pool_entries| { + let (entry, removed) = pool_entries.get_conn_entry(total_conns); + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + entry + }) + } + + pub(crate) fn remove_client( + &mut self, + db_user: (DbName, RoleName), + conn_id: uuid::Uuid, + ) -> bool { + let Self { + pools, + total_conns, + global_connections_count, + .. 
+ } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerRemote) { + let conn_id = client.get_conn_id(); + + if client.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return; + } + + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, 
total_conns={total_conns}"); + } + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } +} + +pub(crate) struct DbUserConnPool { + pub(crate) conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + removed + } + + pub(crate) fn get_conn_entry( + &mut self, + conns: &mut usize, + ) -> (Option>, usize) { + let mut removed = self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + removed += 1; + } + + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + + (conn, removed) + } +} + +pub(crate) struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { + // Maximum number of connections per one endpoint. 
+ // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. + pub max_conns_per_endpoint: usize, + + pub gc_epoch: Duration, + + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, + + // Total number of connections in the pool. + pub max_total_conns: usize, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + #[cfg(test)] + pub(crate) fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub(crate) fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + pub(crate) fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. 
+ if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. + } = pool.get_mut(); + + // ensure that closed clients are removed + for db_pool in pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(total_conns); + } + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } + } + + pub(crate) fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc>> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + 
tracing::field::display(client.inner().get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + client.session().send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } +} + +impl Client { + pub(crate) fn new( + inner: ClientInnerRemote, + conn_info: ConnInfo, + pool: Weak>>, + ) -> Self { + Self { + inner: Some(inner), + span: Span::current(), + conn_info, + pool, + } + } + + pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner = inner.as_mut().expect("client inner should not be removed"); + let inner_ref = inner.inner_mut(); + (inner_ref, Discard { conn_info, pool }) + } + + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux(); + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + + pub(crate) fn do_drop(&mut self) -> Option { + let conn_info = self.conn_info.clone(); + let client = self + .inner + .take() + .expect("client inner should not be removed"); + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { + let current_span = self.span.clone(); + // return connection to the pool + return Some(move || { + let _span = current_span.enter(); + EndpointConnPool::put(&conn_pool, &conn_info, client); + }); + } + None + } +} + +pub(crate) struct Client { + span: Span, + inner: Option>, + conn_info: ConnInfo, + pool: Weak>>, +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +impl Deref for Client { + type Target = C; + + fn deref(&self) -> &Self::Target { + self.inner + .as_ref() + .expect("client inner should not be removed") + .inner() + } +} + +pub(crate) trait ClientInnerExt: 
Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +pub(crate) struct Discard<'a, C: ClientInnerExt> { + conn_info: &'a ConnInfo, + pool: &'a mut Weak>>, +} + +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 9b6bc98557..79bb19328f 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -10,11 +10,12 @@ use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; -use super::conn_pool::ConnInfo; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; @@ -22,15 +23,15 @@ pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -struct ConnPoolEntry { - conn: Send, +pub(crate) struct ConnPoolEntry { + conn: C, conn_id: uuid::Uuid, aux: MetricsAuxInfo, } // Per-endpoint connection pool // Number of open connections is limited by the 
`max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct EndpointConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -40,13 +41,13 @@ pub(crate) struct EndpointConnPool { // seems somewhat redundant though. // // Probably we should run a semaphore and just the single conn. TBD. - conns: VecDeque, + conns: VecDeque>, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, } -impl EndpointConnPool { - fn get_conn_entry(&mut self) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. } = self; loop { @@ -81,7 +82,7 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl Drop for EndpointConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -95,12 +96,12 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. 
- global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -115,7 +116,7 @@ pub(crate) struct GlobalConnPool { config: &'static crate::config::HttpConfig, } -impl GlobalConnPool { +impl GlobalConnPool { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -210,7 +211,7 @@ impl GlobalConnPool { self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Option { + ) -> Option> { let endpoint = conn_info.endpoint_cache_key()?; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); let client = endpoint_pool.write().get_conn_entry()?; @@ -228,7 +229,7 @@ impl GlobalConnPool { fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -268,14 +269,14 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc, + global_pool: Arc>, ctx: &RequestMonitoring, conn_info: &ConnInfo, client: Send, connection: Connect, conn_id: uuid::Uuid, aux: MetricsAuxInfo, -) -> Client { +) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let session_id = ctx.session_id(); @@ -322,13 +323,13 @@ pub(crate) fn poll_http2_client( Client::new(client, aux) } -pub(crate) struct Client { - pub(crate) inner: Send, +pub(crate) struct Client { + pub(crate) inner: C, aux: MetricsAuxInfo, } -impl Client { - pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { +impl Client { + pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { Self { inner, aux } } @@ -339,3 +340,14 @@ impl Client { }) } } + +impl ClientInnerExt for Send { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + // ideally throw something meaningful + -1 + } +} diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs 
index 5df37a8762..c4fdd00f78 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -20,11 +20,12 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; -use super::conn_pool::{ClientInnerExt, ConnInfo}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + use crate::{DbName, RoleName}; struct ConnPoolEntry { @@ -362,7 +363,7 @@ pub(crate) fn poll_client( LocalClient::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInner { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -387,13 +388,24 @@ impl ClientInner { } } -impl LocalClient { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) +impl ClientInner { + pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { + self.jti += 1; + let token = resign_jwt(&self.key, payload, self.jti)?; + + // initiates the auth session + self.inner.simple_query("discard all").await?; + self.inner + .query( + "select auth.jwt_session_init($1)", + &[&token as &(dyn ToSql + Sync)], + ) + .await?; + + let pid = self.inner.get_process_id(); + info!(pid, jti = self.jti, "user session state init"); + + Ok(()) } } @@ -422,6 +434,18 @@ impl LocalClient { pool, } } + + pub(crate) fn client_inner(&mut self) -> (&mut ClientInner, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner_m = inner.as_mut().expect("client inner should not be removed"); + (inner_m, Discard { conn_info, pool }) + } + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, 
C>) { let Self { inner, @@ -434,33 +458,6 @@ impl LocalClient { } } -impl LocalClient { - pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { - let inner = self - .inner - .as_mut() - .expect("client inner should not be removed"); - - inner.jti += 1; - let token = resign_jwt(&inner.key, payload, inner.jti)?; - - // initiates the auth session - inner.inner.simple_query("discard all").await?; - inner - .inner - .query( - "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], - ) - .await?; - - let pid = inner.inner.get_process_id(); - info!(pid, jti = inner.jti, "user session state init"); - - Ok(()) - } -} - /// implements relatively efficient in-place json object key upserting /// /// only supports top-level keys @@ -524,24 +521,15 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt } -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!( - "local_pool: throwing away connection '{conn_info}' because connection is not idle" - ); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - impl LocalClient { + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux; + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -568,6 +556,23 @@ impl Drop for LocalClient { } } +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && 
std::mem::take(self.pool).strong_count() > 0 { + info!( + "local_pool: throwing away connection '{conn_info}' because connection is not idle" + ); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} + #[cfg(test)] mod tests { use p256::ecdsa::SigningKey; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3ed3b6c845..29ff7b9d91 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod conn_pool_lib; mod http_conn_pool; mod http_util; mod json; @@ -20,7 +21,7 @@ use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; -pub use conn_pool::GlobalConnPoolOptions; +pub use conn_pool_lib::GlobalConnPoolOptions; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -65,7 +66,7 @@ pub async fn task_main( } let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config); - let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + let conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config); { let conn_pool = Arc::clone(&conn_pool); tokio::spawn(async move { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3d8a2adef1..bb5eb390a6 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -25,10 +25,11 @@ use urlencoding; use utils::http::error::ApiError; use super::backend::{LocalProxyConnError, PoolingBackend}; -use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; +use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool_lib::{self, ConnInfo}; use super::http_util::json_response; use super::json::{json_to_pg_text, 
pg_text_row_to_json, JsonConversionError}; -use super::{conn_pool, local_conn_pool}; +use super::local_conn_pool; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; @@ -37,6 +38,7 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; + use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::{DbName, RoleName}; @@ -607,7 +609,8 @@ async fn handle_db_inner( let client = match keys.keys { ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - client.set_jwt_session(&payload).await?; + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; Client::Local(client) } _ => { @@ -1021,12 +1024,12 @@ async fn query_to_json( } enum Client { - Remote(conn_pool::Client), + Remote(conn_pool_lib::Client), Local(local_conn_pool::LocalClient), } enum Discard<'a> { - Remote(conn_pool::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), Local(local_conn_pool::Discard<'a, tokio_postgres::Client>), } @@ -1041,7 +1044,7 @@ impl Client { fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { match self { Client::Remote(client) => { - let (c, d) = client.inner(); + let (c, d) = client.inner_mut(); (c, Discard::Remote(d)) } Client::Local(local_client) => { From 35e7d91bc9eb07c8ef70acef5e224c9b9e78a0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 17 Oct 2024 14:07:58 +0200 Subject: [PATCH 36/57] Add config variable for timeline offloading (#9421) Adds a configuration variable for timeline offloading support. 
The added pageserver-global config option controls whether the pageserver automatically offloads timelines during compaction. Therefore, already offloaded timelines are not affected by this, nor is the manual testing endpoint. This allows the rollout of timeline offloading to be driven by the storage team. Part of #8088 --- libs/pageserver_api/src/config.rs | 2 ++ pageserver/src/config.rs | 5 +++++ pageserver/src/tenant.rs | 3 ++- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_timeline_archive.py | 4 ++++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 24474d4840..896a5d8069 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -102,6 +102,7 @@ pub struct ConfigToml { pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub image_compression: ImageCompressionAlgorithm, + pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, @@ -385,6 +386,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), + timeline_offloading: false, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8db78285e4..06d4326459 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -164,6 +164,9 @@ pub struct PageServerConf { pub image_compression: ImageCompressionAlgorithm, + /// Whether to offload archived timelines automatically + pub timeline_offloading: bool, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. 
@@ -321,6 +324,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, virtual_file_io_mode, @@ -364,6 +368,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, // ------------------------------------------------------------ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 689982ddd4..baa2365658 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2187,7 +2187,8 @@ impl Tenant { .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; - let can_offload = can_offload && has_no_unoffloaded_children; + let can_offload = + can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading; if (is_active, can_offload) == (false, false) { None } else { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1992dee930..2b4f949c76 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1565,6 +1565,7 @@ impl Timeline { } /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. + /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
pub(crate) fn can_offload(&self) -> bool { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index ffaed5e130..85e1077fd5 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -119,6 +119,10 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + if not manual_offload: + # (automatic) timeline offloading defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() From 8b479381403cd2be8f7bc7eba69d5074735d8924 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 17 Oct 2024 13:37:21 +0100 Subject: [PATCH 37/57] Add support of extensions for v17 (part 3) (#9430) - pgvector 7.4 update support of extensions for v14-v16: - pgvector 7.2 -> 7.4 --- compute/Dockerfile.compute-node | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index b0ce7c1718..45c1fd9f38 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -353,13 +353,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# v17 is not supported yet because of upstream issue -# https://github.com/pgvector/pgvector/issues/669 -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ - echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ +# vector 0.7.4 supports v17 +# last release v0.7.4 - Aug 5, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ + echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From a7c05686ccbebc856b0ce389a9fa60d2bddbeea6 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 17 Oct 2024 17:20:42 +0300 Subject: [PATCH 38/57] test_runner: Update the README.md to build neon with 'testing' (#9437) Without having the '--features testing' in the cargo build the proxy won't start causing tests to fail. --- test_runner/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/README.md b/test_runner/README.md index e087241c1f..55d8d2faa9 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,7 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. + To run tests you need to add `--features testing` to Rust code build commands. For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. 
Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables From f3a3eefd26284776ab3179116374009ec537ab11 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:29:53 -0400 Subject: [PATCH 39/57] feat(pageserver): do space check before gc-compaction (#9250) part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes gc-compaction may take a lot of disk space, and if it does, the caller should do a partial gc-compaction. This patch adds space check for the compaction job. --------- Signed-off-by: Alex Chi Z --- pageserver/src/disk_usage_eviction_task.rs | 11 +---- pageserver/src/statvfs.rs | 16 ++++++++ pageserver/src/tenant/storage_layer/layer.rs | 4 ++ pageserver/src/tenant/timeline/compaction.rs | 42 ++++++++++++++++++++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a58fa2c0b1..7ab2ba8742 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -1218,16 +1218,7 @@ mod filesystem_level_usage { let stat = Statvfs::get(tenants_dir, mock_config) .context("statvfs failed, presumably directory got unlinked")?; - // https://unix.stackexchange.com/a/703650 - let blocksize = if stat.fragment_size() > 0 { - stat.fragment_size() - } else { - stat.block_size() - }; - - // use blocks_available (b_avail) since, pageserver runs as unprivileged user - let avail_bytes = stat.blocks_available() * blocksize; - let total_bytes = stat.blocks() * blocksize; + let (avail_bytes, total_bytes) = stat.get_avail_total_bytes(); Ok(Usage { config, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 5a6f6e5176..205605bc86 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -53,6 +53,22 @@ impl Statvfs 
{ Statvfs::Mock(stat) => stat.block_size, } } + + /// Get the available and total bytes on the filesystem. + pub fn get_avail_total_bytes(&self) -> (u64, u64) { + // https://unix.stackexchange.com/a/703650 + let blocksize = if self.fragment_size() > 0 { + self.fragment_size() + } else { + self.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = self.blocks_available() * blocksize; + let total_bytes = self.blocks() * blocksize; + + (avail_bytes, total_bytes) + } } pub mod mock { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bbb21b180e..f29a33bae6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -341,6 +341,10 @@ impl Layer { Ok(()) } + pub(crate) async fn needs_download(&self) -> Result, std::io::Error> { + self.0.needs_download().await + } + /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. 
/// diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b9ace1e5b..5588363330 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::filter_iterator::FilterIterator; @@ -1691,6 +1692,45 @@ impl Timeline { unreachable!("key retention is empty") } + /// Check how much space is left on the disk + async fn check_available_space(self: &Arc) -> anyhow::Result { + let tenants_dir = self.conf.tenants_path(); + + let stat = Statvfs::get(&tenants_dir, None) + .context("statvfs failed, presumably directory got unlinked")?; + + let (avail_bytes, _) = stat.get_avail_total_bytes(); + + Ok(avail_bytes) + } + + /// Check if the compaction can proceed safely without running out of space. We assume the size + /// upper bound of the produced files of a compaction job is the same as all layers involved in + /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a + /// compaction. 
+ async fn check_compaction_space( + self: &Arc, + layer_selection: &[Layer], + ) -> anyhow::Result<()> { + let available_space = self.check_available_space().await?; + let mut remote_layer_size = 0; + let mut all_layer_size = 0; + for layer in layer_selection { + let needs_download = layer.needs_download().await?; + if needs_download.is_some() { + remote_layer_size += layer.layer_desc().file_size; + } + all_layer_size += layer.layer_desc().file_size; + } + let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ + if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space + { + return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + } + Ok(()) + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1806,6 +1846,8 @@ impl Timeline { lowest_retain_lsn ); + self.check_compaction_space(&layer_selection).await?; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) From 4c9835f4a3065648c2d6ecd721664b88557aca0f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 17 Oct 2024 16:34:51 +0200 Subject: [PATCH 40/57] storage_controller: delete stale shards when deleting tenant (#9333) ## Problem Tenant deletion only removes the current shards from remote storage. Any stale parent shards (before splits) will be left behind. 
These shards are kept since child shards may reference data from the parent until new image layers are generated. ## Summary of changes * Document a special case for pageserver tenant deletion that deletes all shards in remote storage when given an unsharded tenant ID, as well as any unsharded tenant data. * Pass an unsharded tenant ID to delete all remote storage under the tenant ID prefix. * Split out `RemoteStorage::delete_prefix()` to delete a bucket prefix, with additional test coverage. * Add a `delimiter` argument to `asset_prefix_empty()` to support partial prefix matches (i.e. all shards starting with a given tenant ID). --- libs/remote_storage/src/lib.rs | 53 +++++- libs/remote_storage/tests/common/tests.rs | 206 ++++++++++++++++++++++ pageserver/src/tenant/mgr.rs | 71 +++----- storage_controller/src/service.rs | 73 ++++---- test_runner/fixtures/pageserver/utils.py | 15 +- test_runner/regress/test_tenant_delete.py | 55 ++++++ 6 files changed, 376 insertions(+), 97 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index c6466237bf..719608dd5f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -19,7 +19,12 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc, + collections::HashMap, + fmt::Debug, + num::NonZeroU32, + ops::Bound, + pin::{pin, Pin}, + sync::Arc, time::SystemTime, }; @@ -28,6 +33,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; use futures::{stream::Stream, StreamExt}; +use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -261,7 +267,7 @@ pub trait RemoteStorage: Send + Sync + 'static { max_keys: Option, cancel: &CancellationToken, ) -> Result { - let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut stream = pin!(self.list_streaming(prefix, mode, 
max_keys, cancel)); let mut combined = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; @@ -324,6 +330,35 @@ pub trait RemoteStorage: Send + Sync + 'static { cancel: &CancellationToken, ) -> anyhow::Result<()>; + /// Deletes all objects matching the given prefix. + /// + /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will + /// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will + /// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went + /// through. + async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let mut stream = + pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel)); + while let Some(result) = stream.next().await { + let keys = match result { + Ok(listing) if listing.keys.is_empty() => continue, + Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(), + Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()), + Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()), + Err(err) => return Err(err.into()), + }; + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.delete_objects(&keys, cancel).await?; + } + Ok(()) + } + /// Copy a remote object inside a bucket from one path to another. 
async fn copy( &self, @@ -488,6 +523,20 @@ impl GenericRemoteStorage> { } } + /// See [`RemoteStorage::delete_prefix`] + pub async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await, + Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await, + Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await, + Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await, + } + } + /// See [`RemoteStorage::copy`] pub async fn copy_object( &self, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index e6f33fc3f8..d5da1d48e9 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -199,6 +199,138 @@ async fn list_no_delimiter_works( Ok(()) } +/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"), +/// but only with NoDelimiter. +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn list_partial_prefix( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + // Prefix "fold" should match all "folder{i}" directories with NoDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "fold" matches nothing with WithDelimiter. 
+ let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "" matches everything. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "" matches nothing with WithDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "foo" matches nothing. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "folder2/blob" matches. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/blob")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + let expect: HashSet<_> = ctx + .remote_blobs + .iter() + .filter(|o| o.get_path().starts_with("folder2")) + .cloned() + .collect(); + assert_eq!(&objects, &expect); + + // Prefix "folder2/foo" matches nothing. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? 
+ .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { @@ -265,6 +397,80 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( Ok(()) } +/// Tests that delete_prefix() will delete all objects matching a prefix, including +/// partial prefixes (i.e. "/foo" matches "/foobar"). +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + /// Asserts that the S3 listing matches the given paths. + macro_rules! assert_list { + ($expect:expr) => {{ + let listing = test_client + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!($expect, listing); + }}; + } + + // We start with the full set of uploaded files. + let mut expect = ctx.remote_blobs.clone(); + + // Deleting a non-existing prefix should do nothing. + test_client + .delete_prefix(&RemotePath::from_string("xyz")?, &cancel) + .await?; + assert_list!(expect); + + // Prefixes are case-sensitive. + test_client + .delete_prefix(&RemotePath::from_string("Folder")?, &cancel) + .await?; + assert_list!(expect); + + // Deleting a path which overlaps with an existing object should do nothing. We pick the first + // path in the set as our common prefix. 
+ let path = expect.iter().next().expect("empty set").clone().join("xyz"); + test_client.delete_prefix(&path, &cancel).await?; + assert_list!(expect); + + // Deleting an exact path should work. We pick the first path in the set. + let path = expect.iter().next().expect("empty set").clone(); + test_client.delete_prefix(&path, &cancel).await?; + expect.remove(&path); + assert_list!(expect); + + // Deleting a prefix should delete all matching objects. + test_client + .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel) + .await?; + expect.retain(|p| !p.get_path().as_str().starts_with("folder0/")); + assert_list!(expect); + + // Deleting a common prefix should delete all objects. + test_client + .delete_prefix(&RemotePath::from_string("fold")?, &cancel) + .await?; + expect.clear(); + assert_list!(expect); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9d9852c525..0567f8f3a7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -11,6 +11,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; +use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; @@ -1350,47 +1351,17 @@ impl TenantManager { } } - async fn delete_tenant_remote( - &self, - tenant_shard_id: TenantShardId, - ) -> Result<(), DeleteTenantError> { - let remote_path = remote_tenant_path(&tenant_shard_id); - let mut keys_stream = self.resources.remote_storage.list_streaming( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ); - while let Some(chunk) = keys_stream.next().await { - let keys = match chunk { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { 
- return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; - - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - let keys = keys.into_iter().map(|o| o.key).collect::>(); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; - } - } - - Ok(()) - } - /// If a tenant is attached, detach it. Then remove its data from remote storage. /// /// A tenant is considered deleted once it is gone from remote storage. It is the caller's /// responsibility to avoid trying to attach the tenant again or use it any way once deletion /// has started: this operation is not atomic, and must be retried until it succeeds. + /// + /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove + /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage + /// controller uses this to purge all remote tenant data, including any stale parent shards that + /// may remain after splits. Ideally, this special case would be handled elsewhere. See: + /// . pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, @@ -1442,25 +1413,29 @@ impl TenantManager { // in 500 responses to delete requests. // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will // 503/retry, rather than kicking off a wasteful concurrent deletion. - match backoff::retry( - || async move { self.delete_tenant_remote(tenant_shard_id).await }, - |e| match e { - DeleteTenantError::Cancelled => true, - DeleteTenantError::SlotError(_) => { - unreachable!("Remote deletion doesn't touch slots") - } - _ => false, + // NB: this also deletes partial prefixes, i.e. a path will delete all + // _/* objects. See method comment for why. 
+ backoff::retry( + || async move { + self.resources + .remote_storage + .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel) + .await }, + |_| false, // backoff::retry handles cancellation 1, 3, &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) .await - { - Some(r) => r, - None => Err(DeleteTenantError::Cancelled), - } + .unwrap_or(Err(TimeoutOrCancel::Cancel.into())) + .map_err(|err| { + if TimeoutOrCancel::caused_by_cancel(&err) { + return DeleteTenantError::Cancelled; + } + DeleteTenantError::Other(err) + }) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25e1fb5e1f..ab2c3b5e48 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2862,17 +2862,12 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; - // Detach all shards - let (detach_waiters, shard_ids, node) = { - let mut shard_ids = Vec::new(); + // Detach all shards. This also deletes local pageserver shard data. 
+ let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in - tenants.range_mut(TenantShardId::tenant_range(tenant_id)) - { - shard_ids.push(*tenant_shard_id); - + for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Update the tenant's intent to remove all attachments shard.policy = PlacementPolicy::Detached; shard @@ -2892,7 +2887,7 @@ impl Service { let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); - (detach_waiters, shard_ids, node.clone()) + (detach_waiters, node.clone()) }; // This reconcile wait can fail in a few ways: @@ -2907,38 +2902,34 @@ impl Service { self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) .await?; - let locations = shard_ids - .into_iter() - .map(|s| (s, node.clone())) - .collect::>(); - let results = self.tenant_for_shards_api( - locations, - |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, - 1, - 3, - RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - for result in results { - match result { - Ok(StatusCode::ACCEPTED) => { - // This should never happen: we waited for detaches to finish above - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Unexpectedly still attached on {}", - node - ))); - } - Ok(_) => {} - Err(mgmt_api::Error::Cancelled) => { - return Err(ApiError::ShuttingDown); - } - Err(e) => { - // This is unexpected: remote deletion should be infallible, unless the object store - // at large is unavailable. - tracing::error!("Error deleting via node {}: {e}", node); - return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); - } + // Delete the entire tenant (all shards) from remote storage via a random pageserver. 
+ // Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with + // the tenant ID prefix, including all shards (even possibly stale ones). + match node + .with_client_retries( + |client| async move { + client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await + }, + &self.config.jwt_token, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + { + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {node}: {e}"); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); } } diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 377a95fbeb..4c4306be9e 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -303,9 +303,10 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, + delimiter: str = "/", ) -> None: assert remote_storage is not None - response = list_prefix(remote_storage, prefix) + response = list_prefix(remote_storage, prefix, delimiter) keys = response["KeyCount"] objects: list[ObjectTypeDef] = response.get("Contents", []) common_prefixes = response.get("CommonPrefixes", []) @@ -338,16 +339,18 @@ def assert_prefix_empty( if not (allowed_postfix.endswith(key)): filtered_count += 1 - assert ( - filtered_count == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + assert filtered_count == 0, f"remote prefix {prefix} is not empty: {objects}" # remote_storage must not be None, but that's easier for callers to make mypy happy -def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None): +def 
assert_prefix_not_empty( + remote_storage: Optional[RemoteStorage], + prefix: Optional[str] = None, + delimiter: str = "/", +): assert remote_storage is not None response = list_prefix(remote_storage, prefix) - assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}" + assert response["KeyCount"] != 0, f"remote prefix {prefix} is empty: {response}" def list_prefix( diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 294c1248c5..f486327445 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -20,6 +20,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.workload import Workload from requests.exceptions import ReadTimeout from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -404,3 +405,57 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder cloud_admin_api_token=cloud_admin_token, ) assert healthy + + +def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Deleting a tenant should also delete any stale (pre-split) shards from remote storage. + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + + # Create an unsharded tenant. + tenant_id, timeline_id = env.create_tenant() + + # Write some data. + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + ) + + # Upload a heatmap as well. 
+ env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Split off a few shards, in two rounds. + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=16) + + # Delete the tenant. This should also delete data for the unsharded and count=4 parents. + env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id) + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + delimiter="", # match partial prefixes, i.e. all shards + ) + + dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*")) + assert dirs == [], f"found tenant directories: {dirs}" + + # The initial tenant created by the test harness should still be there. + # Only the tenant we deleted should be removed. + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(env.initial_tenant))), + ) + dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*")) + assert dirs != [], "missing initial tenant directory" + + env.stop() From 299cde899b7b9a31723508afdf7b9e0f0be13912 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 17 Oct 2024 17:19:18 +0200 Subject: [PATCH 41/57] safekeeper: flush WAL on compute disconnect (#9436) ## Problem In #9259, we found that the `check_safekeepers_synced` fast path could result in a lower basebackup LSN than the `flush_lsn` reported by Safekeepers in `VoteResponse`, causing the compute to panic once on startup. This would happen if the Safekeeper had unflushed WAL records due to a compute disconnect. The `TIMELINE_STATUS` query would report a `flush_lsn` below these unflushed records, while `VoteResponse` would flush the WAL and report the advanced `flush_lsn`. See https://github.com/neondatabase/neon/issues/9259#issuecomment-2410849032. ## Summary of changes Flush the WAL if the compute disconnects during WAL processing. 
--- safekeeper/src/receive_wal.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index e35f806e90..2a9ca85299 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -498,21 +498,18 @@ impl WalAcceptor { // we will send keepalives by replying to these requests once per second. let mut next_keepalive = Instant::now(); - loop { - let opt_msg = self.msg_rx.recv().await; - if opt_msg.is_none() { - return Ok(()); // chan closed, streaming terminated - } - let mut next_msg = opt_msg.unwrap(); - + while let Some(mut next_msg) = self.msg_rx.recv().await { // Update walreceiver state in shmem for reporting. if let ProposerAcceptorMessage::Elected(_) = &next_msg { walreceiver_guard.get().status = WalReceiverStatus::Streaming; } let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // loop through AppendRequest's while it's readily available to - // write as many WAL as possible without fsyncing + // Loop through AppendRequests while available to write as many WAL records as + // possible without fsyncing. + // + // Make sure the WAL is flushed before returning, see: + // https://github.com/neondatabase/neon/issues/9259 // // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. // Otherwise, we might end up in a situation where we read a message, but don't @@ -522,7 +519,7 @@ impl WalAcceptor { if let Some(reply) = self.tli.process_msg(&noflush_msg).await? 
{ if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, flush WAL and return on next send/recv } } @@ -531,11 +528,13 @@ impl WalAcceptor { break; } + // continue pulling AppendRequests if available match self.msg_rx.try_recv() { Ok(msg) => next_msg = msg, Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated - } + // on disconnect, flush WAL and return on next send/recv + Err(TryRecvError::Disconnected) => break, + }; } // flush all written WAL to the disk @@ -555,5 +554,6 @@ impl WalAcceptor { next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + Ok(()) } } From 858867c62771e7f24c3d33820a8ca87c5f4f146f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 17 Oct 2024 16:35:19 +0100 Subject: [PATCH 42/57] Add logging of installed_extensions (#9438) Simple PR to log installed_extensions statistics. in the following format: ``` 2024-10-17T13:53:02.860595Z INFO [NEON_EXT_STAT] {"extensions":[{"extname":"plpgsql","versions":["1.0"],"n_databases":2},{"extname":"neon","versions":["1.5"],"n_databases":1}]} ``` --- compute_tools/src/compute.rs | 28 +++++------------------ compute_tools/src/installed_extensions.rs | 21 +++++++++++++++++ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 285be56264..6aec008f3a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -34,6 +34,7 @@ use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use crate::installed_extensions::get_installed_extensions_sync; use crate::local_proxy; use crate::logger::inlinify; use crate::pg_helpers::*; @@ -1121,6 +1122,11 @@ impl ComputeNode { self.pg_reload_conf()?; } self.post_apply_config()?; + + let connstr = self.connstr.clone(); + thread::spawn(move || { + 
get_installed_extensions_sync(connstr).context("get_installed_extensions") + }); } let startup_end_time = Utc::now(); @@ -1484,28 +1490,6 @@ LIMIT 100", info!("Pageserver config changed"); } } - - // Gather info about installed extensions - pub fn get_installed_extensions(&self) -> Result<()> { - let connstr = self.connstr.clone(); - - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "{}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - - Ok(()) - } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 72578b1f34..877f99bff7 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,6 +1,7 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use std::collections::HashSet; +use tracing::info; use url::Url; use anyhow::Result; @@ -79,3 +80,23 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result<()> { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + let result = rt + .block_on(crate::installed_extensions::get_installed_extensions( + connstr, + )) + .expect("failed to get installed extensions"); + + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&result).expect("failed to serialize extensions list") + ); + + Ok(()) +} From 63b3491c1b489487e9d94b8499f401cd57e12290 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:22:44 -0400 Subject: [PATCH 43/57] refactor(pageserver): remove aux v1 code path (#9424) Part of the aux v1 retirement https://github.com/neondatabase/neon/issues/8623 ## Summary of changes Remove write/read path for aux v1, but keeping the config item and the index part field for now. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 2 - pageserver/src/http/routes.rs | 32 -- pageserver/src/pgdatadir_mapping.rs | 323 +++------------ pageserver/src/tenant.rs | 380 +----------------- .../src/tenant/remote_timeline_client.rs | 14 +- .../tenant/remote_timeline_client/index.rs | 4 - pageserver/src/tenant/timeline.rs | 51 +-- pageserver/src/tenant/timeline/delete.rs | 2 - pageserver/src/walredo/apply_neon.rs | 71 +--- test_runner/regress/test_aux_files.py | 78 ---- 10 files changed, 60 insertions(+), 897 deletions(-) delete mode 100644 test_runner/regress/test_aux_files.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3ec9cac2c3..5b0b6bebe3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -743,8 +743,6 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. 
- /// The last aux file policy being used on this timeline - pub last_aux_file_policy: Option, pub is_archived: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 36a6ed427b..e6663ef56f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,7 +18,6 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; @@ -474,8 +473,6 @@ async fn build_timeline_info_common( is_archived: Some(is_archived), walreceiver_status, - - last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } @@ -2399,31 +2396,6 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } -async fn force_aux_policy_switch_handler( - mut r: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - check_permission(&r, None)?; - let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; - let policy: AuxFilePolicy = json_request(&mut r).await?; - - let state = get_state(&r); - - let tenant = state - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id)?; - tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = - active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) - .await?; - timeline - .do_switch_aux_policy(policy) - .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, ()) -} - async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -3136,10 +3108,6 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .put("/v1/io_mode", |r| api_handler(r, 
put_io_mode_handler)) - .put( - "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", - |r| api_handler(r, force_aux_policy_switch_handler), - ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 900da5beab..f2a11e65c1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -22,7 +22,6 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -33,7 +32,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -677,21 +676,6 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - async fn list_aux_files_v1( - &self, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), - Err(e) => { - // This is expected: historical databases do not have the key. 
- debug!("Failed to get info about AUX files: {}", e); - Ok(HashMap::new()) - } - } - } - async fn list_aux_files_v2( &self, lsn: Lsn, @@ -722,10 +706,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result<(), PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { - self.list_aux_files_v2(lsn, ctx).await?; - } + self.list_aux_files_v2(lsn, ctx).await?; Ok(()) } @@ -734,51 +715,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - match current_policy { - Some(AuxFilePolicy::V1) => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - let empty_str = if res.is_empty() { ", empty" } else { "" }; - warn!( - "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" - ); - Ok(res) - } - None => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - if !res.is_empty() { - warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); - } - Ok(res) - } - Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, - Some(AuxFilePolicy::CrossValidation) => { - let v1_result = self.list_aux_files_v1(lsn, ctx).await; - let v2_result = self.list_aux_files_v2(lsn, ctx).await; - match (v1_result, v2_result) { - (Ok(v1), Ok(v2)) => { - if v1 != v2 { - tracing::error!( - "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" - ); - return Err(PageReconstructError::Other(anyhow::anyhow!( - "unmatched aux file v1 v2 result" - ))); - } - Ok(v1) - } - (Ok(_), Err(v2)) => { - tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); - Err(v2) - } - (Err(v1), Ok(_)) => { - tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); - Err(v1) - } - (Err(_), Err(v2)) => Err(v2), - } - } - } + self.list_aux_files_v2(lsn, ctx).await } pub(crate) async fn get_replorigins( @@ -954,9 +891,6 @@ 
impl Timeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { - result.add_key(AUX_FILES_KEY); - } // Add extra keyspaces in the test cases. Some test cases write keys into the storage without // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` @@ -1166,9 +1100,6 @@ impl<'a> DatadirModification<'a> { self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); - // Create AuxFilesDirectory - self.init_aux_dir()?; - let buf = if self.tline.pg_version >= 17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), @@ -1347,9 +1278,6 @@ impl<'a> DatadirModification<'a> { // 'true', now write the updated 'dbdirs' map back. let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); - - // Create AuxFilesDirectory as well - self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -1726,200 +1654,60 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { - if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { - return Ok(()); - } - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, 0)); - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - Ok(()) - } - pub async fn put_file( &mut self, path: &str, content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let switch_policy = self.tline.get_switch_aux_file_policy(); - - let policy = { - let current_policy = self.tline.last_aux_file_policy.load(); - // Allowed switch path: - // * no aux files -> v1/v2/cross-validation - // * cross-validation->v2 - - let current_policy = if current_policy.is_none() { - // This path will only be hit once per tenant: we will decide the final policy in this code block. 
- // The next call to `put_file` will always have `last_aux_file_policy != None`. - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; - if aux_files_key_v1.is_empty() { - None - } else { - warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); - self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; - Some(AuxFilePolicy::V1) - } - } else { - current_policy - }; - - if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { - self.tline.do_switch_aux_policy(switch_policy)?; - info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); - switch_policy - } else { - // This branch handles non-valid migration path, and the case that switch_policy == current_policy. - // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. - current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) - } + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), }; - - if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { - let key = aux_file::encode_aux_file_key(path); - // retrieve the key from the engine - let old_val = match self.get(key, ctx).await { - Ok(val) => Some(val), - Err(PageReconstructError::MissingKey(_)) => None, - Err(e) => return Err(e.into()), - }; - let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? 
+ } else { + Vec::new() + }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); } else { - Vec::new() - }; - let mut other_files = Vec::with_capacity(files.len()); - let mut modifying_file = None; - for file @ (p, content) in files { - if path == p { - assert!( - modifying_file.is_none(), - "duplicated entries found for {}", - path - ); - modifying_file = Some(content); - } else { - other_files.push(file); - } + other_files.push(file); } - let mut new_files = other_files; - match (modifying_file, content.is_empty()) { - (Some(old_content), false) => { - self.tline - .aux_file_size_estimator - .on_update(old_content.len(), content.len()); - new_files.push((path, content)); - } - (Some(old_content), true) => { - self.tline - .aux_file_size_estimator - .on_remove(old_content.len()); - // not adding the file key to the final `new_files` vec. - } - (None, false) => { - self.tline.aux_file_size_estimator.on_add(content.len()); - new_files.push((path, content)); - } - (None, true) => warn!("removing non-existing aux file: {}", path), - } - let new_val = aux_file::encode_file_value(&new_files)?; - self.put(key, Value::Image(new_val.into())); } - - if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; - - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value. 
- dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; - } else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. - return Err(e.into()); - } - // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but - // the original code assumes all other errors are missing keys. Therefore, we keep the code path - // the same for now, though in theory, we should only match the `MissingKey` variant. - Err( - e @ (PageReconstructError::Other(_) - | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey(_)), - ) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. 
- - if !matches!(e, PageReconstructError::MissingKey(_)) { - let e = utils::error::report_compact_sources(&e); - tracing::warn!("treating error as if it was a missing key: {}", e); - } - - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - n_files = 1; - aux_files.dir = Some(dir); - } - } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); } - - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. + } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => warn!("removing non-existing aux file: {}", path), } + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); Ok(()) } @@ -2089,12 +1877,6 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } - /// Only used during unit tests, force putting a key into the modification. 
- #[cfg(test)] - pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { - self.put(key, val); - } - fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2212,21 +1994,6 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] -pub(crate) struct AuxFilesDirectory { - pub(crate) files: HashMap, -} - -impl AuxFilesDirectory { - pub(crate) fn upsert(&mut self, key: String, value: Option) { - if let Some(value) = value { - self.files.insert(key, value); - } else { - self.files.remove(&key); - } - } -} - #[derive(Debug, Serialize, Deserialize)] struct RelSizeEntry { nblocks: u32, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index baa2365658..1066d165cd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,7 +20,6 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -800,7 +799,6 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, - last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -811,10 +809,6 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, - // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, - // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. - // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. 
- last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -829,10 +823,6 @@ impl Tenant { if let Some(index_part) = index_part.as_ref() { timeline.remote_client.init_upload_queue(index_part)?; - - timeline - .last_aux_file_policy - .store(index_part.last_aux_file_policy()); } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. @@ -1403,15 +1393,12 @@ impl Tenant { None }; - let last_aux_file_policy = index_part.last_aux_file_policy(); - self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, - last_aux_file_policy, ctx, ) .await @@ -1824,7 +1811,6 @@ impl Tenant { create_guard, initdb_lsn, None, - None, ) .await } @@ -3032,7 +3018,6 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, - last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -3061,7 +3046,6 @@ impl Tenant { resources, pg_version, state, - last_aux_file_policy, self.attach_wal_lag_cooldown.clone(), self.cancel.child_token(), ); @@ -3720,7 +3704,6 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), - src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3914,7 +3897,6 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, - None, ) .await?; @@ -3986,7 +3968,6 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, - last_aux_file_policy: Option, ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; @@ -4002,7 +3983,6 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, - last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -4600,7 +4580,6 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::pgdatadir_mapping::AuxFilesDirectory; use 
crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; @@ -4609,7 +4588,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; @@ -4618,7 +4597,6 @@ mod tests { use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; - use utils::bin_ser::BeSer; use utils::id::TenantId; static TEST_KEY: Lazy = @@ -6422,16 +6400,9 @@ mod tests { } #[tokio::test] - async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") - .await - .unwrap(); + async fn test_aux_file_e2e() { + let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); - // the default aux file policy to switch is v2 if not set by the admins - assert_eq!( - harness.tenant_conf.switch_aux_file_policy, - AuxFilePolicy::default_tenant_config() - ); let (tenant, ctx) = harness.load().await; let mut lsn = Lsn(0x08); @@ -6441,9 +6412,6 @@ mod tests { .await .unwrap(); - // no aux file is written at this point, so the persistent flag should be unset - assert_eq!(tline.last_aux_file_policy.load(), None); - { lsn += 8; let mut modification = tline.begin_modification(lsn); @@ -6454,30 +6422,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - 
&pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" - ); - // we can read everything from the storage let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( @@ -6495,12 +6439,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "keep v2 storage format when new files are written" - ); - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), @@ -6512,321 +6450,9 @@ mod tests { .await .unwrap(); - // child copies the last flag even if that is not on remote storage yet - assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); - assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); - - // even if we crash here without flushing parent timeline with it's new - // last_aux_file_policy we are safe, because child was never meant to access ancestor's - // files. the ancestor can even switch back to V1 because of a migration safely. 
- } - - #[tokio::test] - async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::CrossValidation), - "dirty index_part.json reflected state is yet to be updated" - ); - - // we can still read the auxfile v1 before we ingest anything new - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should 
apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")), - "cross validation writes to both v1 and v2 so this should be available in v2" - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - - // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V1), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"third", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V1, - "wanted state has been updated again, even if invalid request" - ); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - - // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 
8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test3", b"last", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); - - assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - assert_eq!( - files.get("pg_logical/mappings/test3"), - Some(&bytes::Bytes::from_static(b"last")) - ); - } - - #[tokio::test] - async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "dirty index_part.json reflected state is yet to be updated" - ); - - // lose all data from v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!(files.get("pg_logical/mappings/test1"), None); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - 
.unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // read data ingested in v2 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - // lose all data from v1 - assert_eq!(files.get("pg_logical/mappings/test1"), None); - } - - #[tokio::test] - async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: vec![( - "test_file".to_string(), - Bytes::copy_from_slice(b"test_file"), - )] - .into_iter() - .collect(), - }) - .unwrap(); - modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - modification.commit(&ctx).await.unwrap(); - } - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "keep using v1 because there are aux files writting with v1" - ); - - // we can still read the auxfile v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("test_file"), - Some(&bytes::Bytes::from_static(b"test_file")) - ); } #[tokio::test] diff --git 
a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1f9ae40af5..5e9702bd3d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -628,18 +628,6 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. - pub(crate) fn schedule_index_upload_for_aux_file_policy_update( - self: &Arc, - last_aux_file_policy: Option, - ) -> anyhow::Result<()> { - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; - self.schedule_index_upload(upload_queue)?; - Ok(()) - } - /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. /// /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index c51ff54919..3a74a4ed11 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -133,10 +133,6 @@ impl IndexPart { pub(crate) fn example() -> Self { Self::empty(TimelineMetadata::example()) } - - pub(crate) fn last_aux_file_policy(&self) -> Option { - self.last_aux_file_policy - } } /// Metadata gathered for each of the layer files. 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2b4f949c76..d67a139dfa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -28,9 +28,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, + CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -98,12 +98,12 @@ use crate::{ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ - pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + pgdatadir_mapping::DirectoryKind, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; @@ -206,11 +206,6 @@ pub struct TimelineResources { pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } -pub(crate) struct AuxFilesState { - pub(crate) dir: Option, - pub(crate) n_deltas: usize, -} - /// The relation size cache caches relation sizes at the end of the timeline. 
It speeds up WAL /// ingestion considerably, because WAL ingestion needs to check on most records if the record /// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end @@ -413,15 +408,9 @@ pub struct Timeline { timeline_get_throttle: Arc>, - /// Keep aux directory cache to avoid it's reconstruction on each update - pub(crate) aux_files: tokio::sync::Mutex, - /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, - /// Indicate whether aux file v2 storage is enabled. - pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, - /// Some test cases directly place keys into the timeline without actually modifying the directory /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense @@ -2012,14 +2001,6 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } - pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { - let tenant_conf = self.tenant_conf.load(); - tenant_conf - .tenant_conf - .switch_aux_file_policy - .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) - } - pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2152,7 +2133,6 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, - aux_file_policy: Option, attach_wal_lag_cooldown: Arc>, cancel: CancellationToken, ) -> Arc { @@ -2282,15 +2262,8 @@ impl Timeline { timeline_get_throttle: resources.timeline_get_throttle, - aux_files: tokio::sync::Mutex::new(AuxFilesState { - dir: None, - n_deltas: 0, - }), - aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), - last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), - #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), @@ -2301,10 +2274,6 @@ 
impl Timeline { attach_wal_lag_cooldown, }; - if aux_file_policy == Some(AuxFilePolicy::V1) { - warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); - } - result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -4479,14 +4448,6 @@ impl Timeline { ) -> Result<(), detach_ancestor::Error> { detach_ancestor::complete(self, tenant, attempt, ctx).await } - - /// Switch aux file policy and schedule upload to the index part. - pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { - self.last_aux_file_policy.store(Some(policy)); - self.remote_client - .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; - Ok(()) - } } impl Drop for Timeline { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 305c5758cc..71b9e4e288 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -283,8 +283,6 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, - // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace - None, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index facf01004c..c067787f97 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,7 @@ -use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, BytesMut}; +use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; @@ -13,7 +12,6 @@ use postgres_ffi::v14::nonrelfile_utils::{ }; use postgres_ffi::BLCKSZ; use tracing::*; -use utils::bin_ser::BeSer; use utils::lsn::Lsn; /// Can this request be served by neon redo functions @@ -236,13 +234,9 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); } } - NeonWalRecord::AuxFile { file_path, content } => { - let mut dir = AuxFilesDirectory::des(page)?; - dir.upsert(file_path.clone(), content.clone()); - - page.clear(); - let mut writer = page.writer(); - dir.ser_into(&mut writer)?; + NeonWalRecord::AuxFile { .. } => { + // No-op: this record will never be created in aux v2. 
+ warn!("AuxFile record should not be created in aux v2"); } #[cfg(test)] NeonWalRecord::Test { @@ -250,6 +244,7 @@ pub(crate) fn apply_in_neon( clear, will_init, } => { + use bytes::BufMut; if *will_init { assert!(*clear, "init record must be clear to ensure correctness"); } @@ -261,59 +256,3 @@ pub(crate) fn apply_in_neon( } Ok(()) } - -#[cfg(test)] -mod test { - use bytes::Bytes; - use pageserver_api::key::AUX_FILES_KEY; - - use super::*; - use std::collections::HashMap; - - /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile - #[test] - fn apply_aux_file_deltas() -> anyhow::Result<()> { - let base_dir = AuxFilesDirectory { - files: HashMap::from([ - ("two".to_string(), Bytes::from_static(b"content0")), - ("three".to_string(), Bytes::from_static(b"contentX")), - ]), - }; - let base_image = AuxFilesDirectory::ser(&base_dir)?; - - let deltas = vec![ - // Insert - NeonWalRecord::AuxFile { - file_path: "one".to_string(), - content: Some(Bytes::from_static(b"content1")), - }, - // Update - NeonWalRecord::AuxFile { - file_path: "two".to_string(), - content: Some(Bytes::from_static(b"content99")), - }, - // Delete - NeonWalRecord::AuxFile { - file_path: "three".to_string(), - content: None, - }, - ]; - - let file_path = AUX_FILES_KEY; - let mut page = BytesMut::from_iter(base_image); - - for record in deltas { - apply_in_neon(&record, Lsn(8), file_path, &mut page)?; - } - - let reconstructed = AuxFilesDirectory::des(&page)?; - let expect = HashMap::from([ - ("one".to_string(), Bytes::from_static(b"content1")), - ("two".to_string(), Bytes::from_static(b"content99")), - ]); - - assert_eq!(reconstructed.files, expect); - - Ok(()) - } -} diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py deleted file mode 100644 index 91d674d0db..0000000000 --- a/test_runner/regress/test_aux_files.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -from fixtures.log_helper import log -from fixtures.neon_fixtures 
import ( - AuxFileStore, - NeonEnvBuilder, - logical_replication_sync, -) - - -def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - client = env.pageserver.http_client() - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 - client.set_tenant_config(tenant_id, tenant_config) - # aux file v2 is enabled on the write path, so for now, it should be unset (or null) - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] - is None - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - - cur.execute("create table t(pk integer primary key, payload integer)") - cur.execute( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" - ) - cur.execute("create publication pub1 for table t, replication_example") - - # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) - # instead of going through the full logical replication process. 
- vanilla_pg.start() - vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") - vanilla_pg.safe_psql( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" - ) - connstr = endpoint.connstr().replace("'", "''") - log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") - vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - - # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) - vanilla_pg.stop() - endpoint.stop() - - with env.pageserver.http_client() as client: - # aux file v2 flag should be enabled at this point - assert ( - client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] - == AuxFileStore.V2 - ) - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = "V1" - client.set_tenant_config(tenant_id, tenant_config) - # the flag should still be enabled - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) - env.pageserver.restart() - with env.pageserver.http_client() as client: - # aux file v2 flag should be persisted - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) From 24398bf0600223fb74fb3aa33ca4e4374209f84d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Oct 2024 19:02:24 +0100 Subject: [PATCH 44/57] pageserver: detect & warn on loading an old index which is probably the result of a bad generation (#9383) ## Problem The pageserver generally trusts the storage controller/control plane to give it valid generations. 
However, sometimes it should be obvious that a generation is bad, and for defense in depth we should detect that on the pageserver. This PR is part 1 of 2: 1. in this PR we detect and warn on such situations, but do not block starting up the tenant. Once we have confidence that the check is not firing unexpectedly in the field 2. part 2 of 2 will introduce a condition that refuses to start a tenant in this situation, and a test for that (maybe, if we can figure out how to spoof an ancient mtime) Related: #6951 ## Summary of changes - When loading an index older than 2 weeks, log an INFO message noting that we will check for other indices - When loading an index older than 2 weeks _and_ a newer-generation index exists, log a warning. --- pageserver/src/http/routes.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 45 ++++++++++++++++++- .../tenant/remote_timeline_client/download.rs | 11 ++--- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e6663ef56f..8f928fd81b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2251,7 +2251,7 @@ async fn tenant_scan_remote_handler( %timeline_id)) .await { - Ok((index_part, index_generation)) => { + Ok((index_part, index_generation, _index_mtime)) => { tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); generation = std::cmp::max(generation, index_generation); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 5e9702bd3d..450084aca2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -505,7 +505,7 @@ impl RemoteTimelineClient { }, ); - let (index_part, _index_generation) = download::download_index_part( + let (index_part, 
index_generation, index_last_modified) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -519,6 +519,49 @@ impl RemoteTimelineClient { ) .await?; + // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very + // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g. + // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that + // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is + // also a newer index available, that is surprising. + const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600); + let index_age = index_last_modified.elapsed().unwrap_or_else(|e| { + if e.duration() > Duration::from_secs(5) { + // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution + // timestamp, it is common to be out by at least 1 second. + tracing::warn!("Index has modification time in the future: {e}"); + } + Duration::ZERO + }); + if index_age > INDEX_AGE_CHECKS_THRESHOLD { + tracing::info!( + ?index_generation, + age = index_age.as_secs_f64(), + "Loaded an old index, checking for other indices..." + ); + + // Find the highest-generation index + let (_latest_index_part, latest_index_generation, latest_index_mtime) = + download::download_index_part( + &self.storage_impl, + &self.tenant_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await?; + + if latest_index_generation > index_generation { + // Unexpected! Why are we loading such an old index if a more recent one exists? 
+ tracing::warn!( + ?index_generation, + ?latest_index_generation, + ?latest_index_mtime, + "Found a newer index while loading an old one" + ); + } + } + if index_part.deleted_at.is_some() { Ok(MaybeDeletedIndexPart::Deleted(index_part)) } else { diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 692e4d3096..b5d4b0f0bb 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::time::SystemTime; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -343,10 +344,10 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let index_part_bytes = download_retry_forever( + let (index_part_bytes, index_part_mtime) = download_retry_forever( || async { let download = storage .download(&remote_path, &DownloadOpts::default(), cancel) @@ -359,7 +360,7 @@ async fn do_download_index_part( tokio::io::copy_buf(&mut stream, &mut bytes).await?; - Ok(bytes) + Ok((bytes, download.last_modified)) }, &format!("download {remote_path:?}"), cancel, @@ -370,7 +371,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok((index_part, index_generation)) + Ok((index_part, index_generation, index_part_mtime)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -385,7 +386,7 @@ pub(crate) async fn download_index_part( timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, 
Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { From 928d98b6dcb57ae22a3da18fc6786b90c8dcae0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 17 Oct 2024 21:25:51 +0200 Subject: [PATCH 45/57] Update Rust to 1.82.0 and mold to 2.34.0 (#9445) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1820-2024-10-17). Also update mold. [release notes for 2.34.0](https://github.com/rui314/mold/releases/tag/v2.34.0), [release notes for 2.34.1](https://github.com/rui314/mold/releases/tag/v2.34.1). Prior update was in #8939. --- Dockerfile.build-tools | 6 +++--- rust-toolchain.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 7cba1c8635..f05c60661c 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -72,7 +72,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=18 +ENV LLVM_VERSION=19 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION=v2.33.0 +ENV MOLD_VERSION=v2.34.1 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep 
LLVM`) -ENV RUSTC_VERSION=1.81.0 +ENV RUSTC_VERSION=1.82.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3c5d0b12a6..92b7929c7f 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.81.0" +channel = "1.82.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From d762ad0883f204dee1b15729db8a6a3d6d5497e5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 17 Oct 2024 20:45:37 +0100 Subject: [PATCH 46/57] update rustls (#9396) The forever ongoing effort of juggling multiple versions of rustls :3 now with new crypto library aws-lc. Because of dependencies, it is currently impossible to not have both ring and aws-lc in the dep tree, therefore our only options are not updating rustls or having both crypto backends enabled... 
According to benchmarks run by the rustls maintainer, aws-lc is faster than ring in some cases too , so it's not without its upsides, --- Cargo.lock | 220 +++++++++++++----- Cargo.toml | 12 +- libs/postgres_backend/tests/simple_select.rs | 29 ++- proxy/src/bin/pg_sni_router.rs | 10 +- proxy/src/compute.rs | 30 ++- proxy/src/config.rs | 14 +- proxy/src/proxy/tests/mod.rs | 51 ++-- .../src/scan_safekeeper_metadata.rs | 22 +- workspace_hack/Cargo.toml | 11 +- 9 files changed, 276 insertions(+), 123 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b212bac2e..ad29fa4634 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "asn1-rs" -version = "0.5.2" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -164,25 +164,25 @@ dependencies = [ [[package]] name = "asn1-rs-derive" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", "synstructure", ] [[package]] name = "asn1-rs-impl" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -310,6 +310,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-lc-rs" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070" +dependencies = [ + "aws-lc-sys", + "mirai-annotations", + "paste", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" +dependencies = [ + "bindgen 0.69.5", + "cc", + "cmake", + "dunce", + "fs_extra", + "libc", + "paste", +] + [[package]] name = "aws-runtime" version = "1.4.3" @@ -595,7 +622,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.11", + "rustls 0.21.12", "tokio", "tracing", ] @@ -915,6 +942,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags 2.4.1", + "cexpr", + "clang-sys", + "itertools 0.10.5", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.52", + "which", +] + [[package]] name = "bindgen" version = "0.70.1" @@ -924,7 +974,7 @@ dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "log", "prettyplease", "proc-macro2", @@ -1038,12 +1088,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.83" +version = "1.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -1169,6 +1220,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] 
+name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -1624,9 +1684,9 @@ dependencies = [ [[package]] name = "der-parser" -version = "8.2.0" +version = "9.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" dependencies = [ "asn1-rs", "displaydoc", @@ -1755,6 +1815,12 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.14" @@ -2059,6 +2125,12 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2412,6 +2484,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "hostname" version = "0.4.0" @@ -2581,7 +2662,7 @@ dependencies = [ "http 0.2.9", "hyper 0.14.30", "log", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", @@ -2801,9 +2882,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -2907,6 +2988,12 @@ dependencies = [ "spin", ] +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.150" @@ -3137,6 +3224,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mirai-annotations" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" + [[package]] name = "multimap" version = "0.8.3" @@ -3356,9 +3449,9 @@ dependencies = [ [[package]] name = "oid-registry" -version = "0.6.1" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff" +checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" dependencies = [ "asn1-rs", ] @@ -4053,14 +4146,14 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-util", "tracing", ] @@ -4082,7 +4175,7 @@ name = "postgres_ffi" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "bytes", "crc32c", "env_logger", @@ -4219,7 +4312,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.12.1", + "itertools 0.10.5", "log", "multimap", "once_cell", @@ -4239,7 +4332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies 
= [ "anyhow", - "itertools 0.12.1", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.52", @@ -4327,8 +4420,8 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4345,7 +4438,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", "tracing", @@ -4509,12 +4602,13 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" +checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779" dependencies = [ "pem", "ring", + "rustls-pki-types", "time", "yasna", ] @@ -4693,7 +4787,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-pemfile 1.0.2", "serde", "serde_json", @@ -4991,9 +5085,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.11" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", @@ -5021,6 +5115,7 @@ version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -5089,9 +5184,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = 
"16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -5109,6 +5204,7 @@ version = "0.102.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5312,7 +5408,7 @@ checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", "reqwest 0.12.4", - "rustls 0.21.11", + "rustls 0.21.12", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5807,8 +5903,8 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "serde", "serde_json", "storage_controller_client", @@ -5930,14 +6026,13 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "synstructure" -version = "0.12.6" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", - "unicode-xid", + "syn 2.0.52", ] [[package]] @@ -6236,16 +6331,15 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" +checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ - "futures", "ring", - "rustls 0.22.4", + "rustls 0.23.7", "tokio", "tokio-postgres", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "x509-certificate", ] @@ -6255,7 +6349,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.11", + "rustls 0.21.12", "tokio", ] @@ -6678,16 +6772,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.7" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pki-types", - "rustls-webpki 0.102.2", "url", "webpki-roots 0.26.1", ] @@ -6876,7 +6969,7 @@ name = "walproposer" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "postgres_ffi", "utils", ] @@ -7051,6 +7144,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "whoami" version = "1.5.1" @@ -7295,7 +7400,6 @@ dependencies = [ "digest", "either", "fail", - "futures", "futures-channel", "futures-executor", "futures-io", @@ -7311,7 +7415,7 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.12.1", + "itertools 0.10.5", "lazy_static", "libc", "log", @@ -7332,6 +7436,8 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", + "rustls 0.23.7", + "rustls-webpki 0.102.2", "scopeguard", "serde", "serde_json", @@ -7340,7 +7446,6 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 1.0.109", "syn 2.0.52", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", @@ -7348,6 +7453,7 @@ dependencies = [ "time-macros", "tokio", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", "toml_edit", @@ 
-7383,9 +7489,9 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" dependencies = [ "asn1-rs", "data-encoding", diff --git a/Cargo.toml b/Cargo.toml index a1a974b33b..4c6a24ecde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -142,7 +142,7 @@ reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.22" +rustls = "0.23" rustls-pemfile = "2" scopeguard = "1.1" sysinfo = "0.29.2" @@ -172,8 +172,8 @@ tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.11.0" -tokio-rustls = "0.25" +tokio-postgres-rustls = "0.12.0" +tokio-rustls = "0.26" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -192,8 +192,8 @@ url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -rustls-native-certs = "0.7" -x509-parser = "0.15" +rustls-native-certs = "0.8" +x509-parser = "0.16" whoami = "1.5.1" ## TODO replace this with tracing @@ -244,7 +244,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.12" +rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.12" diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 900083ea7f..9d3031d699 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -2,6 +2,7 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use 
pq_proto::{BeMessage, RowDescriptor}; +use rustls::crypto::aws_lc_rs; use std::io::Cursor; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -92,10 +93,13 @@ static CERT: Lazy> = Lazy::new(|| { async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; - let server_cfg = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone_key()) - .unwrap(); + let server_cfg = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) + .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); @@ -105,13 +109,16 @@ async fn simple_select_ssl() { pgbackend.run(&mut handler, &CancellationToken::new()).await }); - let client_cfg = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(CERT.clone()).unwrap(); - store - }) - .with_no_client_auth(); + let client_cfg = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(CERT.clone()).unwrap(); + store + }) + .with_no_client_auth(); let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); let tls_connect = >::make_tls_connect( &mut make_tls_connect, diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 00eb830d98..13b7fdd40a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -15,6 +15,7 @@ use proxy::context::RequestMonitoring; use 
proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; +use rustls::crypto::aws_lc_rs; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; @@ -104,10 +105,11 @@ async fn main() -> anyhow::Result<()> { let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) + let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new( + aws_lc_rs::default_provider(), + )) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_single_cert(cert_chain, key)? .into(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 212e82497f..a7c2cab4a1 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,6 +8,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; +use rustls::crypto::aws_lc_rs; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -38,6 +39,9 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), + #[error("Couldn't load native TLS certificates: {0:?}")] + TlsCertificateError(Vec), + #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -84,6 +88,7 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => 
crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -293,12 +298,20 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder() + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") .dangerous() .with_custom_certificate_verifier(verifier) } else { - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - rustls::ClientConfig::builder().with_root_certificates(root_store) + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(ConnectionError::TlsCertificateError)? + .clone(); + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates(root_store) }; let client_config = client_config.with_no_client_auth(); @@ -359,10 +372,15 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } -fn load_certs() -> Result, io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> Result, Vec> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + return Err(der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2ec8c7adda..0d5ebd88f9 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok}; use 
clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::ring::sign; +use rustls::crypto::aws_lc_rs::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; use tracing::{error, info}; @@ -126,12 +126,12 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index e50ae4bc93..88175d73b1 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -9,6 +9,7 @@ use async_trait::async_trait; use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; +use rustls::crypto::aws_lc_rs; use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; @@ -38,25 +39,27 @@ fn generate_certs( pki_types::CertificateDer<'static>, pki_types::PrivateKeyDer<'static>, )> { - let ca = rcgen::Certificate::from_params({ + let ca_key = rcgen::KeyPair::generate()?; + let ca = { let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); - params - })?; + params.self_signed(&ca_key)? 
+ }; - let cert = rcgen::Certificate::from_params({ - let mut params = rcgen::CertificateParams::new(vec![hostname.into()]); + let cert_key = rcgen::KeyPair::generate()?; + let cert = { + let mut params = rcgen::CertificateParams::new(vec![hostname.into()])?; params.distinguished_name = rcgen::DistinguishedName::new(); params .distinguished_name .push(rcgen::DnType::CommonName, common_name); - params - })?; + params.signed_by(&cert_key, &ca, &ca_key)? + }; Ok(( - pki_types::CertificateDer::from(ca.serialize_der()?), - pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), - pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), + ca.der().clone(), + cert.der().clone(), + pki_types::PrivateKeyDer::Pkcs8(cert_key.serialize_der().into()), )) } @@ -90,10 +93,13 @@ fn generate_tls_config<'a>( let (ca, cert, key) = generate_certs(hostname, common_name)?; let tls_config = { - let config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? - .into(); + let config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? + .with_no_client_auth() + .with_single_cert(vec![cert.clone()], key.clone_key())? + .into(); let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; @@ -108,13 +114,16 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? 
+ .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(ca)?; + store + }) + .with_no_client_auth(); ClientConfig { config, hostname } }; diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 15f3665fac..6c312d0036 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,12 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; +use anyhow::{bail, Context}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; +use rustls::crypto::aws_lc_rs; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; @@ -231,10 +233,15 @@ async fn check_timeline( }) } -fn load_certs() -> Result, std::io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not load native tls certs: {:?}", der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); @@ -248,9 +255,12 @@ async fn load_timelines_from_db( // Use rustls (Neon requires TLS) let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); + let client_config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? 
+ .with_root_certificates(root_store) + .with_no_client_auth(); let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; // The connection object performs the actual communication with the database, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 1347d6ddff..28c51b8ac1 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -32,7 +32,6 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -48,7 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools = { version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -66,6 +65,8 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +rustls = { version = "0.23", features 
= ["ring"] } +rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } @@ -79,6 +80,7 @@ tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } +tokio-rustls = { version = "0.26", features = ["ring"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } @@ -104,7 +106,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -122,8 +124,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } -syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = 
"2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" } From b8304f90d6ad9a5f118a59ac392b3330495827d3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 18 Oct 2024 10:27:50 +0100 Subject: [PATCH 47/57] 2024 oct new clippy lints (#9448) Fixes new lints from `cargo +nightly clippy` (`clippy 0.1.83 (798fb83f 2024-10-16)`) --- compute_tools/src/extension_server.rs | 2 +- .../pageserver_api/src/models/partitioning.rs | 6 ++-- libs/postgres_backend/src/lib.rs | 3 +- libs/pq_proto/src/lib.rs | 2 +- libs/tenant_size_model/src/svg.rs | 2 +- libs/tracing-utils/src/http.rs | 2 +- libs/utils/src/lsn.rs | 2 +- libs/utils/src/poison.rs | 4 +-- libs/utils/src/shard.rs | 2 +- libs/utils/src/simple_rcu.rs | 4 +-- libs/utils/src/sync/heavier_once_cell.rs | 4 +-- libs/utils/src/tracing_span_assert.rs | 10 +++---- pageserver/compaction/src/helpers.rs | 10 +++---- pageserver/src/consumption_metrics/upload.rs | 2 +- pageserver/src/disk_usage_eviction_task.rs | 2 +- pageserver/src/metrics.rs | 4 +-- pageserver/src/statvfs.rs | 2 +- pageserver/src/tenant/block_io.rs | 4 +-- pageserver/src/tenant/disk_btree.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 2 +- .../src/tenant/secondary/heatmap_uploader.rs | 1 - pageserver/src/tenant/storage_layer.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 3 +- .../src/tenant/storage_layer/image_layer.rs | 3 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 2 +- .../tenant/storage_layer/merge_iterator.rs | 8 +++--- pageserver/src/tenant/vectored_blob_io.rs | 21 +++----------- pageserver/src/virtual_file.rs | 4 +-- proxy/src/auth/credentials.rs | 2 +- proxy/src/config.rs | 2 +- 
proxy/src/context/parquet.rs | 2 +- proxy/src/intern.rs | 2 +- proxy/src/lib.rs | 6 +--- proxy/src/proxy/tests/mod.rs | 10 +++---- proxy/src/scram/exchange.rs | 4 --- proxy/src/serverless/conn_pool.rs | 12 ++++---- proxy/src/serverless/conn_pool_lib.rs | 28 +++++++++---------- proxy/src/serverless/http_conn_pool.rs | 3 +- proxy/src/serverless/json.rs | 6 ++-- proxy/src/serverless/local_conn_pool.rs | 3 +- proxy/src/serverless/sql_over_http.rs | 1 - proxy/src/usage_metrics.rs | 10 +++---- proxy/src/waiters.rs | 2 +- safekeeper/src/timeline.rs | 6 ++-- 45 files changed, 92 insertions(+), 124 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 6ef7e0837f..da2d107b54 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. 
let human_version = get_pg_config("--version", pgbin); - return parse_pg_version(&human_version).to_string(); + parse_pg_version(&human_version).to_string() } fn parse_pg_version(human_version: &str) -> &str { diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index f6644be635..69832b9a0d 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning { { pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - impl<'a> serde::Serialize for KeySpace<'a> { + impl serde::Serialize for KeySpace<'_> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning { pub struct WithDisplay<'a, T>(&'a T); -impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { +impl serde::Serialize for WithDisplay<'_, T> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -55,7 +55,7 @@ impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { pub struct KeyRange<'a>(&'a std::ops::Range); -impl<'a> serde::Serialize for KeyRange<'a> { +impl serde::Serialize for KeyRange<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 085540e7b9..9d274b25e6 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -921,12 +921,11 @@ impl PostgresBackendReader { /// A futures::AsyncWrite implementation that wraps all data written to it in CopyData /// messages. 
/// - pub struct CopyDataWriter<'a, IO> { pgb: &'a mut PostgresBackend, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> { +impl AsyncWrite for CopyDataWriter<'_, IO> { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index a01191bd5d..9ffaaba584 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01"; pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; -impl<'a> BeMessage<'a> { +impl BeMessage<'_> { /// Serialize `message` to the given `buf`. /// Apart from smart memory managemet, BytesMut is good here as msg len /// precedes its body and it is handy to write it down first and then fill diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 0de2890bb4..25ebb1c3d8 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -97,7 +97,7 @@ pub fn draw_svg( Ok(result) } -impl<'a> SvgDraw<'a> { +impl SvgDraw<'_> { fn calculate_svg_layout(&mut self) { // Find x scale let segments = &self.storage.segments; diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index e6fdf9be45..2168beee88 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -82,7 +82,7 @@ where fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { struct HeaderExtractor<'a>(&'a HeaderMap); - impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> { fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|value| value.to_str().ok()) } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 06d5c27ebf..3ec2c130bd 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -37,7 +37,7 @@ 
impl<'de> Deserialize<'de> for Lsn { is_human_readable_deserializer: bool, } - impl<'de> Visitor<'de> for LsnVisitor { + impl Visitor<'_> for LsnVisitor { type Value = Lsn; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index c3e2fba20c..ab9ebb3c5a 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -73,7 +73,7 @@ impl Poison { /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. pub struct Guard<'a, T>(&'a mut Poison); -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { pub fn data(&self) -> &T { &self.0.data } @@ -94,7 +94,7 @@ impl<'a, T> Guard<'a, T> { } } -impl<'a, T> Drop for Guard<'a, T> { +impl Drop for Guard<'_, T> { fn drop(&mut self) { match self.0.state { State::Clean => { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d146010b41..782cddc599 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -164,7 +164,7 @@ impl TenantShardId { } } -impl<'a> std::fmt::Display for ShardSlug<'a> { +impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 01750b2aef..6700f86e4a 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, } -impl<'a, V> Deref for RcuWriteGuard<'a, V> { +impl Deref for RcuWriteGuard<'_, V> { type Target = V; fn deref(&self) -> &V { @@ -160,7 +160,7 @@ impl<'a, V> Deref for RcuWriteGuard<'a, V> { } } -impl<'a, V> RcuWriteGuard<'a, V> { +impl RcuWriteGuard<'_, V> { /// /// Store a new value. The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. 
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index dc711fb028..66c2065554 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> { } } -impl<'a, T> Drop for CountWaitingInitializers<'a, T> { +impl Drop for CountWaitingInitializers<'_, T> { fn drop(&mut self) { self.0.initializers.fetch_sub(1, Ordering::Relaxed); } @@ -250,7 +250,7 @@ impl std::ops::DerefMut for Guard<'_, T> { } } -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index d24c81ad0b..add2fa7920 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -184,23 +184,23 @@ mod tests { struct MemoryIdentity<'a>(&'a dyn Extractor); - impl<'a> MemoryIdentity<'a> { + impl MemoryIdentity<'_> { fn as_ptr(&self) -> *const () { self.0 as *const _ as *const () } } - impl<'a> PartialEq for MemoryIdentity<'a> { + impl PartialEq for MemoryIdentity<'_> { fn eq(&self, other: &Self) -> bool { self.as_ptr() == other.as_ptr() } } - impl<'a> Eq for MemoryIdentity<'a> {} - impl<'a> Hash for MemoryIdentity<'a> { + impl Eq for MemoryIdentity<'_> {} + impl Hash for MemoryIdentity<'_> { fn hash(&self, state: &mut H) { self.as_ptr().hash(state); } } - impl<'a> fmt::Debug for MemoryIdentity<'a> { + impl fmt::Debug for MemoryIdentity<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 8ed1d16082..9dbb6ecedf 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -133,7 +133,7 @@ 
enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } -impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { +impl LazyLoadLayer<'_, E> { fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), @@ -147,23 +147,23 @@ impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { } } } -impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { +impl PartialOrd for LazyLoadLayer<'_, E> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { +impl Ord for LazyLoadLayer<'_, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } -impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { +impl PartialEq for LazyLoadLayer<'_, E> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == std::cmp::Ordering::Equal } } -impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} +impl Eq for LazyLoadLayer<'_, E> {} type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 0325ee403a..1eb25d337b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>( } } - impl<'a> ExactSizeIterator for Iter<'a> {} + impl ExactSizeIterator for Iter<'_> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 7ab2ba8742..ca44fbe6ae 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate { let ts = 
chrono::DateTime::::from(self.last_activity_ts); let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); struct DisplayIsDebug<'a, T>(&'a T); - impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> { + impl std::fmt::Debug for DisplayIsDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b76efa5b48..3e824b59fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { op: SmgrQueryType, } -impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { +impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { fn drop(&mut self) { let elapsed = self.start.elapsed(); let ex_throttled = self @@ -1560,7 +1560,7 @@ impl BasebackupQueryTime { } } -impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { +impl BasebackupQueryTimeOngoingRecording<'_, '_> { pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 205605bc86..4e8be58d58 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -90,7 +90,7 @@ pub mod mock { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); // round it up to the nearest block multiple - let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + let used_blocks = used_bytes.div_ceil(*blocksize); if used_blocks > *total_blocks { panic!( diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 3afa3a86b9..1c82e5454d 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -50,13 +50,13 @@ impl From> for BlockLease<'static> { } #[cfg(test)] -impl<'a> From> for BlockLease<'a> { +impl From> for BlockLease<'_> { fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self { 
BlockLease::Arc(value) } } -impl<'a> Deref for BlockLease<'a> { +impl Deref for BlockLease<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 0107b0ac7e..b302cbc975 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> { values: &'a [u8], } -impl<'a, const L: usize> OnDiskNode<'a, L> { +impl OnDiskNode<'_, L> { /// /// Interpret a PAGE_SZ page as a node. /// diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 450084aca2..14b894d17c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2182,7 +2182,7 @@ pub(crate) struct UploadQueueAccessor<'a> { inner: std::sync::MutexGuard<'a, UploadQueue>, } -impl<'a> UploadQueueAccessor<'a> { +impl UploadQueueAccessor<'_> { pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { match &*self.inner { UploadQueue::Initialized(x) => &x.clean.0, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 0aad5bf392..e680fd705b 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -108,7 +108,6 @@ impl scheduler::Completion for WriteComplete { /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have /// uploads disabled. - struct UploaderTenantState { // This Weak only exists to enable culling idle instances of this type // when the Tenant has been deallocated. 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 99bd0ece57..a229b59560 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -705,7 +705,7 @@ pub mod tests { /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers. struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range); -impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> { +impl std::fmt::Debug for RangeDisplayDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}..{}", self.0.start, self.0.end) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 8be7d7876f..d1079876f8 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -529,8 +529,7 @@ impl DeltaLayerWriterInner { key_end: Key, ctx: &RequestContext, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; let mut file = self.blob_writer.into_inner(ctx).await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index de8155f455..6c1a943470 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -828,8 +828,7 @@ impl ImageLayerWriterInner { ctx: &RequestContext, end_key: Option, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract 
PAGE_SZ for header diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f29a33bae6..38a7cd09af 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -978,7 +978,7 @@ impl LayerInner { let timeline = self .timeline .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + .ok_or(DownloadError::TimelineShutdown)?; // count cancellations, which currently remain largely unexpected let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index ffe7ca5f3e..8e750e1187 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -339,7 +339,7 @@ impl<'de> serde::Deserialize<'de> for LayerName { struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { +impl serde::de::Visitor<'_> for LayerNameVisitor { type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 0831fd9530..f91e27241d 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -99,21 +99,21 @@ impl<'a> PeekableLayerIterRef<'a> { } } -impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { +impl std::cmp::PartialEq for IteratorWrapper<'_> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} +impl std::cmp::Eq for IteratorWrapper<'_> {} -impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { +impl std::cmp::PartialOrd for IteratorWrapper<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a> std::cmp::Ord for 
IteratorWrapper<'a> { +impl std::cmp::Ord for IteratorWrapper<'_> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; let a = self.peek_next_key_lsn_value(); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 792c769b4f..0c03791034 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -73,7 +73,7 @@ impl<'a> BufView<'a> { } } -impl<'a> Deref for BufView<'a> { +impl Deref for BufView<'_> { type Target = [u8]; fn deref(&self) -> &Self::Target { @@ -84,7 +84,7 @@ impl<'a> Deref for BufView<'a> { } } -impl<'a> AsRef<[u8]> for BufView<'a> { +impl AsRef<[u8]> for BufView<'_> { fn as_ref(&self) -> &[u8] { match self { BufView::Slice(slice) => slice, @@ -196,11 +196,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { max_read_size: Option, } -/// Computes x / d rounded up. -fn div_round_up(x: usize, d: usize) -> usize { - (x + (d - 1)) / d -} - impl ChunkedVectoredReadBuilder { const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. 
@@ -220,7 +215,7 @@ impl ChunkedVectoredReadBuilder { .expect("First insertion always succeeds"); let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); + let end_blk_no = (end_offset as usize).div_ceil(Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, @@ -248,7 +243,7 @@ impl ChunkedVectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let start_blk_no = start as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); + let end_blk_no = (end as usize).div_ceil(Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { @@ -975,12 +970,4 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; Ok(()) } - - #[test] - fn test_div_round_up() { - const CHUNK_SIZE: usize = 512; - assert_eq!(1, div_round_up(200, CHUNK_SIZE)); - assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); - assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); - } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index d260116b38..5a364b7aaf 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -724,9 +724,9 @@ impl VirtualFileInner { *handle_guard = handle; - return Ok(FileGuard { + Ok(FileGuard { slot_guard: slot_guard.downgrade(), - }); + }) } pub fn remove(self) { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index fa6bc4c6f5..465e427f7c 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -193,7 +193,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { D: serde::Deserializer<'de>, { struct StrVisitor; - impl<'de> serde::de::Visitor<'de> for StrVisitor { + impl serde::de::Visitor<'_> for StrVisitor { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 0d5ebd88f9..3baa7ec751 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -558,7 +558,7 @@ pub struct RetryConfig { } impl RetryConfig { - /// Default options for RetryConfig. + // Default options for RetryConfig. /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b0ad0e4566..3432ac5ff6 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -104,7 +104,7 @@ struct Options<'a> { options: &'a StartupMessageParams, } -impl<'a> serde::Serialize for Options<'a> { +impl serde::Serialize for Options<'_> { fn serialize(&self, s: S) -> Result where S: serde::Serializer, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 09fd9657d0..49aab917e4 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -55,7 +55,7 @@ impl std::ops::Deref for InternedString { impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { fn deserialize>(d: D) -> Result { struct Visitor(PhantomData); - impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + impl serde::de::Visitor<'_> for Visitor { type Value = InternedString; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 74bc778a36..a7b3d45c95 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -76,11 +76,7 @@ ) )] // List of temporarily allowed lints to unblock beta/nightly. -#![allow( - unknown_lints, - // TODO: 1.82: Add `use` where necessary and remove from this list. 
- impl_trait_overcaptures, -)] +#![allow(unknown_lints)] use std::convert::Infallible; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 88175d73b1..3f54b0661b 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -73,11 +73,11 @@ impl ClientConfig<'_> { self, ) -> anyhow::Result< impl tokio_postgres::tls::TlsConnect< - S, - Error = impl std::fmt::Debug, - Future = impl Send, - Stream = RustlsStream, - >, + S, + Error = impl std::fmt::Debug + use, + Future = impl Send + use, + Stream = RustlsStream, + > + use, > { let mut mk = MakeRustlsConnect::new(self.config); let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 493295c938..6a13f645a5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -218,16 +218,12 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? 
{ Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index b97c656510..8401e3a1c9 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -11,13 +11,6 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; - -use crate::context::RequestMonitoring; -use crate::control_plane::messages::MetricsAuxInfo; -use crate::metrics::Metrics; - -use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; - #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, @@ -25,6 +18,11 @@ use { std::{sync::atomic, time::Duration}, }; +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; + #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { pub(crate) conn_info: ConnInfo, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 6e964ce878..844730194d 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -1,25 +1,23 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::time::Duration; + use dashmap::DashMap; use parking_lot::RwLock; use rand::Rng; -use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio_postgres::ReadyForQueryStatus; +use tracing::{debug, info, Span}; +use super::backend::HttpConnError; +use super::conn_pool::ClientInnerRemote; 
+use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::ColdStartInfo; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use super::conn_pool::ClientInnerRemote; -use tracing::info; -use tracing::{debug, Span}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -482,7 +480,7 @@ impl Client { }) } - pub(crate) fn do_drop(&mut self) -> Option { + pub(crate) fn do_drop(&mut self) -> Option> { let conn_info = self.conn_info.clone(); let client = self .inner diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 79bb19328f..363e397976 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -10,12 +10,11 @@ use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; - -use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 8c56d317cc..569e2da571 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -155,10 +155,10 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) + pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v) } -fn _pg_array_parse( +fn 
pg_array_parse_inner( pg_array: &str, elem_type: &Type, nested: bool, @@ -211,7 +211,7 @@ fn _pg_array_parse( '{' if !quote => { level += 1; if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?; entries.push(res); for _ in 0..off - 1 { pg_array_chr.next(); diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c4fdd00f78..a01afd2820 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -25,7 +25,6 @@ use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; - use crate::{DbName, RoleName}; struct ConnPoolEntry { @@ -530,7 +529,7 @@ impl LocalClient { }) } - fn do_drop(&mut self) -> Option { + fn do_drop(&mut self) -> Option> { let conn_info = self.conn_info.clone(); let client = self .inner diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index bb5eb390a6..6fbb044669 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,6 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; - use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::{DbName, RoleName}; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index c5384c0b0e..f944d5aec3 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -375,7 +375,7 @@ pub async fn task_backup( let now = Utc::now(); collect_metrics_backup_iteration( &USAGE_METRICS.backup_endpoints, - &storage, + storage.as_ref(), &hostname, prev, now, @@ -395,7 +395,7 @@ pub async fn task_backup( 
#[instrument(skip_all)] async fn collect_metrics_backup_iteration( endpoints: &DashMap, FastHasher>, - storage: &Option, + storage: Option<&GenericRemoteStorage>, hostname: &str, prev: DateTime, now: DateTime, @@ -446,7 +446,7 @@ async fn collect_metrics_backup_iteration( } async fn upload_events_chunk( - storage: &Option, + storage: Option<&GenericRemoteStorage>, chunk: EventChunk<'_, Event>, remote_path: &RemotePath, cancel: &CancellationToken, @@ -577,10 +577,10 @@ mod tests { // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; // backup counter is unregistered after the second iteration assert!(metrics.backup_endpoints.is_empty()); diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 7e07f6a2af..330e73f02f 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -73,7 +73,7 @@ struct DropKey<'a, T> { registry: &'a Waiters, } -impl<'a, T> Drop for DropKey<'a, T> { +impl Drop for DropKey<'_, T> { fn drop(&mut self) { self.registry.0.lock().remove(&self.key); } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3494b0b764..41b9490088 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -122,7 +122,7 @@ impl<'a> WriteGuardSharedState<'a> { } } -impl<'a> Deref for WriteGuardSharedState<'a> { +impl Deref for WriteGuardSharedState<'_> { type Target = SharedState; fn deref(&self) -> &Self::Target { @@ -130,13 +130,13 @@ impl<'a> Deref for WriteGuardSharedState<'a> { } } -impl<'a> DerefMut for WriteGuardSharedState<'a> { +impl DerefMut for WriteGuardSharedState<'_> { fn 
deref_mut(&mut self) -> &mut Self::Target { &mut self.guard } } -impl<'a> Drop for WriteGuardSharedState<'a> { +impl Drop for WriteGuardSharedState<'_> { fn drop(&mut self) { let term_flush_lsn = TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); From 24654b8eee8706e8ae98948733a28b56df83536b Mon Sep 17 00:00:00 2001 From: Jere Vaara Date: Fri, 18 Oct 2024 13:25:45 +0300 Subject: [PATCH 48/57] compute_ctl: Add endpoint that allows setting role grants (#9395) This PR introduces a `/grants` endpoint which allows setting specific `privileges` to certain `role` for a certain `schema`. Related to #9344 Together these endpoints will be used to configure JWT extension and set correct usage to its schema to specific roles that will need them. --------- Co-authored-by: Conrad Ludgate --- compute_tools/src/compute.rs | 43 ++++++++++++ compute_tools/src/http/api.rs | 48 ++++++++++++- compute_tools/src/http/openapi_spec.yaml | 89 ++++++++++++++++++++++++ libs/compute_api/src/lib.rs | 1 + libs/compute_api/src/privilege.rs | 35 ++++++++++ libs/compute_api/src/requests.rs | 13 +++- libs/compute_api/src/responses.rs | 13 +++- test_runner/fixtures/endpoint/http.py | 8 +++ test_runner/regress/test_role_grants.py | 41 +++++++++++ 9 files changed, 287 insertions(+), 4 deletions(-) create mode 100644 libs/compute_api/src/privilege.rs create mode 100644 test_runner/regress/test_role_grants.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 6aec008f3a..11fee73f03 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -15,6 +15,7 @@ use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use compute_api::spec::PgIdent; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -25,6 +26,7 @@ use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use compute_api::privilege::Privilege; use 
compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; @@ -1373,6 +1375,47 @@ LIMIT 100", download_size } + pub async fn set_role_grants( + &self, + db_name: &PgIdent, + schema_name: &PgIdent, + privileges: &[Privilege], + role_name: &PgIdent, + ) -> Result<()> { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + // TODO: support other types of grants apart from schemas? + let query = format!( + "GRANT {} ON SCHEMA {} TO {}", + privileges + .iter() + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .collect::>() + .join(", "), + // quote the schema and role name as identifiers to sanitize them. 
+ schema_name.pg_quote(), + role_name.pg_quote(), + ); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + + Ok(()) + } + #[tokio::main] pub async fn prepare_preload_libraries( &self, diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 79e6158081..133ab9f5af 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,8 +9,10 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; +use compute_api::requests::{ConfigurationRequest, SetRoleGrantsRequest}; +use compute_api::responses::{ + ComputeStatus, ComputeStatusResponse, GenericAPIError, SetRoleGrantsResponse, +}; use anyhow::Result; use hyper::header::CONTENT_TYPE; @@ -165,6 +167,48 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /grants POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for set_role_grants request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + + let res = compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await; + match res { + Ok(()) => render_json(Body::from( + serde_json::to_string(&SetRoleGrantsResponse { + database: request.database, + schema: request.schema, + role: request.role, + privileges: request.privileges, + }) + .unwrap(), + )), + Err(e) => render_json_error( + &format!("could not grant role privileges to 
the schema: {e}"), + // TODO: can we filter on role/schema not found errors + // and return appropriate error code? + StatusCode::INTERNAL_SERVER_ERROR, + ), + } + } + // get the list of installed extensions // currently only used in python tests // TODO: call it from cplane diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index e9fa66b323..73dbdc3ee9 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -127,6 +127,41 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /grants: + post: + tags: + - Grants + summary: Apply grants to the database. + description: "" + operationId: setRoleGrants + requestBody: + description: Grants request. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsRequest" + responses: + 200: + description: Grants applied. + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsResponse" + 412: + description: | + Compute is not in the right state for processing the request. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error occurred during grants application. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -427,6 +462,60 @@ components: n_databases: type: integer + SetRoleGrantsRequest: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges to set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. 
+ example: "neon" + + SetRoleGrantsResponse: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + # # Errors # diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs index 210a52d089..f4f3d92fc6 100644 --- a/libs/compute_api/src/lib.rs +++ b/libs/compute_api/src/lib.rs @@ -1,5 +1,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] +pub mod privilege; pub mod requests; pub mod responses; pub mod spec; diff --git a/libs/compute_api/src/privilege.rs b/libs/compute_api/src/privilege.rs new file mode 100644 index 0000000000..dc0d870946 --- /dev/null +++ b/libs/compute_api/src/privilege.rs @@ -0,0 +1,35 @@ +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Select, + Insert, + Update, + Delete, + Truncate, + References, + Trigger, + Usage, + Create, + Connect, + Temporary, + Execute, +} + +impl Privilege { + pub fn as_str(&self) -> &'static str { + match self { + Privilege::Select => "SELECT", + Privilege::Insert => "INSERT", + Privilege::Update => "UPDATE", + Privilege::Delete => "DELETE", + Privilege::Truncate => "TRUNCATE", + Privilege::References => "REFERENCES", + Privilege::Trigger => "TRIGGER", + Privilege::Usage => "USAGE", + Privilege::Create => "CREATE", + Privilege::Connect => "CONNECT", + Privilege::Temporary => "TEMPORARY", + Privilege::Execute => "EXECUTE", + } + } +} diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 5896c7dc65..fbc7577dd9 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,6 +1,9 @@ //! 
Structs representing the JSON formats used in the compute_ctl's HTTP API. -use crate::spec::ComputeSpec; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, PgIdent}, +}; use serde::Deserialize; /// Request of the /configure API @@ -12,3 +15,11 @@ use serde::Deserialize; pub struct ConfigurationRequest { pub spec: ComputeSpec, } + +#[derive(Deserialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5023fce003..fadf524273 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,7 +6,10 @@ use std::fmt::Display; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::{ComputeSpec, Database, Role}; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, Database, PgIdent, Role}, +}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -168,3 +171,11 @@ pub struct InstalledExtension { pub struct InstalledExtensions { pub extensions: Vec, } + +#[derive(Clone, Debug, Default, Serialize)] +pub struct SetRoleGrantsResponse { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 26895df8a6..e7b014b4a9 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,3 +28,11 @@ class EndpointHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/installed_extensions") res.raise_for_status() return res.json() + + def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): + res = self.post( + f"http://localhost:{self.port}/grants", + json={"database": database, "schema": schema, "role": role, "privileges": privileges}, + ) + res.raise_for_status() + return res.json() diff 
--git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py new file mode 100644 index 0000000000..b2251875f0 --- /dev/null +++ b/test_runner/regress/test_role_grants.py @@ -0,0 +1,41 @@ +import psycopg2 +from fixtures.neon_fixtures import NeonEnv + + +def test_role_grants(neon_simple_env: NeonEnv): + """basic test for the endpoint that grants permissions for a role against a schema""" + + env = neon_simple_env + + env.create_branch("test_role_grants") + + endpoint = env.endpoints.create_start("test_role_grants") + + endpoint.safe_psql("CREATE DATABASE test_role_grants") + endpoint.safe_psql("CREATE SCHEMA IF NOT EXISTS test_schema", dbname="test_role_grants") + endpoint.safe_psql("CREATE ROLE test_role WITH LOGIN", dbname="test_role_grants") + + # confirm we do not yet have access + pg_conn = endpoint.connect(dbname="test_role_grants", user="test_role") + with pg_conn.cursor() as cur: + try: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + raise ValueError("create table should not succeed") + except psycopg2.errors.InsufficientPrivilege: + pass + except BaseException as e: + raise e + + client = endpoint.http_client() + res = client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) + + # confirm we have access + with pg_conn.cursor() as cur: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + cur.execute('INSERT INTO "test_schema"."test_table" (id) VALUES (1)') + cur.execute('SELECT id from "test_schema"."test_table"') + res = cur.fetchall() + + assert res == [(1,)], "select should not succeed" From b7173b1ef05f694f3fa7968dadc4a298ea6d66e8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 18 Oct 2024 11:29:23 +0100 Subject: [PATCH 49/57] storcon: fix case where we might fail to send compute notifications after two opposite migrations (#9435) ## Problem If we migrate A->B, then B->A, and the notification of A->B fails, then we 
might have retained state that makes us think "A" is the last state we sent to the compute hook, whereas when we migrate B->A we should really be sending a fresh notification in case our earlier failed notification has actually mutated the remote compute config. Closes: #9417 ## Summary of changes - Add a reproducer for the bug (`test_storage_controller_compute_hook_revert`) - Refactor compute hook code to represent remote state with `ComputeRemoteState` which stores a boolean for whether the compute has fully applied the change as well as the request that the compute accepted. - The actual bug fix: after sending a compute notification, if we got a 423 response then update our ComputeRemoteState to reflect that we have mutated the remote state. This way, when we later try and notify for our historic location, we will properly see that as a change and send the notification. Co-authored-by: Vlad Lazar --- storage_controller/src/compute_hook.rs | 80 ++++++++--- .../regress/test_storage_controller.py | 127 ++++++++++++++++-- 2 files changed, 183 insertions(+), 24 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index bafae1f551..b63a322b87 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -28,7 +28,7 @@ struct UnshardedComputeHookTenant { node_id: NodeId, // Must hold this lock to send a notification. - send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -38,7 +38,22 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a change since our last successful send. 
- send_lock: Arc>>, + send_lock: Arc>>, +} + +/// Represents our knowledge of the compute's state: we can update this when we get a +/// response from a notify API call, which tells us what has been applied. +/// +/// Should be wrapped in an Option<>, as we cannot always know the remote state. +#[derive(PartialEq, Eq, Debug)] +struct ComputeRemoteState { + // The request body which was acked by the compute + request: ComputeHookNotifyRequest, + + // Whether the cplane indicated that the state was applied to running computes, or just + // persisted. In the Neon control plane, this is the difference between a 423 response (meaning + // persisted but not applied), and a 2xx response (both persisted and applied) + applied: bool, } enum ComputeHookTenant { @@ -64,7 +79,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -188,11 +203,11 @@ enum MaybeSendResult { Transmit( ( ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + tokio::sync::OwnedMutexGuard>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock(Arc>>), // Nothing requires sending Noop, } @@ -201,7 +216,7 @@ impl ComputeHookTenant { fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, + lock: Option>>, ) -> MaybeSendResult { let locked = match lock { Some(already_locked) => already_locked, @@ -257,11 +272,22 @@ impl ComputeHookTenant { tracing::info!("Tenant isn't yet ready to emit a notification"); MaybeSendResult::Noop } - Some(request) if Some(&request) == locked.as_ref() => { - // No change from the last value successfully sent + Some(request) + if Some(&request) == locked.as_ref().map(|s| &s.request) + && locked.as_ref().map(|s| s.applied).unwrap_or(false) => + { + tracing::info!( + "Skipping notification because remote 
state already matches ({:?})", + &request + ); + // No change from the last value successfully sent, and our state indicates that the last + // value sent was fully applied on the control plane side. MaybeSendResult::Noop } - Some(request) => MaybeSendResult::Transmit((request, locked)), + Some(request) => { + // Our request differs from the last one sent, or the last one sent was not fully applied on the compute side + MaybeSendResult::Transmit((request, locked)) + } } } } @@ -550,10 +576,28 @@ impl ComputeHook { }) }; - if result.is_ok() { - // Before dropping the send lock, stash the request we just sent so that - // subsequent callers can avoid redundantly re-sending the same thing. - *send_lock_guard = Some(request); + match result { + Ok(_) => { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: true, + }); + } + Err(NotifyError::Busy) => { + // Busy result means that the server responded and has stored the new configuration, + // but was not able to fully apply it to the compute + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: false, + }); + } + Err(_) => { + // General error case: we can no longer know the remote state, so clear it. This will result in + // the logic in maybe_send recognizing that we should call the hook again. 
+ *send_lock_guard = None; + } } result } @@ -707,7 +751,10 @@ pub(crate) mod tests { assert!(request.stripe_size.is_none()); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); // Try asking again: this should be a no-op @@ -750,7 +797,10 @@ pub(crate) mod tests { assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); Ok(()) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1dcc37c407..a4e293da9e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -576,6 +576,14 @@ def test_storage_controller_compute_hook( env.storage_controller.consistency_check() +NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" +NOTIFY_FAILURE_LOGS = [ + ".*Failed to notify compute.*", + ".*Reconcile error.*Cancelled", + ".*Reconcile error.*Control plane tenant busy", +] + + def test_storage_controller_stuck_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, @@ -620,15 +628,8 @@ def test_storage_controller_stuck_compute_hook( dest_pageserver = env.get_pageserver(dest_ps_id) shard_0_id = TenantShardId(tenant_id, 0, 0) - NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" - env.storage_controller.allowed_errors.extend( - [ - NOTIFY_BLOCKED_LOG, - ".*Failed to notify compute.*", - ".*Reconcile error.*Cancelled", - ".*Reconcile error.*Control plane tenant busy", - ] - ) + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # We expect the controller to hit the 423 (locked) and retry. 
Migration shouldn't complete until that @@ -719,6 +720,114 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_compute_hook_revert( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + 'revert' in the sense of a migration which gets reversed shortly after, as may happen during + a rolling upgrade. + + This is a reproducer for https://github.com/neondatabase/neon/issues/9417 + + The buggy behavior was that when the compute hook gave us errors, we assumed our last successfully + sent state was still in effect, so when migrating back to the original pageserver we didn't bother + notifying of that. This is wrong because even a failed request might mutate the state on the server. + """ + + # We will run two pageserver to migrate and check that the storage controller sends notifications + # when migrating. 
+ neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + handle_params = {"status": 200} + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) + tenant_id = env.initial_tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + pageserver_a = env.get_tenant_pageserver(tenant_id) + pageserver_b = [p for p in env.pageservers if p.id != pageserver_a.id][0] + + def notified_ps(ps_id: int) -> None: + latest = notifications[-1] + log.info(f"Waiting for {ps_id}, have {latest}") + assert latest is not None + assert latest["shards"] is not None + assert latest["shards"][0]["node_id"] == ps_id + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Migrate A -> B, and make notifications fail while this is happening + handle_params["status"] = 423 + + with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"): + # We expect the controller to give us an error because its reconciliation timed out + # waiting for the compute hook. 
+ env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id) + + # Although the migration API failed, the hook should still see pageserver B (it remembers what + # was posted even when returning an error code) + wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + + # Although the migration API failed, the tenant should still have moved to the right pageserver + assert len(pageserver_b.http_client().tenant_list()) == 1 + + # Before we clear the failure on the migration hook, we need the controller to give up + # trying to notify about B -- the bug case we're reproducing is when the controller + # _never_ successfully notified for B, then tries to notify for A. + # + # The controller will give up notifying if the origin of a migration becomes unavailable. + pageserver_a.stop() + + # Preempt heartbeats for a faster test + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Offline"}) + + def logged_giving_up(): + env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") + + wait_until(30, 1, logged_giving_up) + + pageserver_a.start() + + # Preempt heartbeats for determinism + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Active"}) + # Starting node will prompt a reconcile to clean up old AttachedStale location, for a deterministic test + # we want that complete before we start our migration. Tolerate failure because our compute hook is + # still configured to fail + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # This exception _might_ be raised: it depends if our reconcile_all hit the on-node-activation + # Reconciler lifetime or ran after it already completed. + log.info(f"Expected error from reconcile_all: {e}") + + # Migrate B -> A, with a working compute hook: the controller should notify the hook because the + # last update it made that was acked (423) by the compute was for node B. 
+ handle_params["status"] = 200 + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test From 98fee7a97d68db55049583d403dcb21755bc4db5 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 18 Oct 2024 13:31:14 +0300 Subject: [PATCH 50/57] Increase shared_buffers in test_subscriber_synchronous_commit. (#9427) Might make the test less flaky. --- test_runner/regress/test_logical_replication.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 87991eadf1..c26bf058e2 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -558,10 +558,10 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication return publisher_flush_lsn -# Test that subscriber takes into account quorum committed flush_lsn in -# flush_lsn reporting to publisher. Without this, it may ack too far, losing -# data on restart because publisher advances START_REPLICATION position to the -# confirmed_flush_lsn of the slot. +# Test that neon subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, subscriber may ack too far, +# losing data on restart because publisher implicitly advances position given
def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down @@ -578,7 +578,10 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): vanilla_pg.safe_psql("create extension neon;") env.create_branch("subscriber") - sub = env.endpoints.create("subscriber") + # We want all data to fit into shared_buffers because later we stop + # safekeeper and insert more; this shouldn't cause page requests as they + # will be stuck. + sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"]) sub.start() with vanilla_pg.cursor() as pcur: From 15fecffe6ba400693619c6a022ed6205769a61ae Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 18 Oct 2024 12:42:41 +0200 Subject: [PATCH 51/57] Update ruff to much newer version (#9433) Includes a multidict patch release to fix build with newer cpython. --- poetry.lock | 207 ++++++++++-------- pyproject.toml | 2 +- test_runner/fixtures/neon_cli.py | 4 +- test_runner/fixtures/neon_fixtures.py | 18 +- test_runner/fixtures/utils.py | 2 +- .../performance/test_logical_replication.py | 14 +- .../performance/test_physical_replication.py | 12 +- .../regress/test_download_extensions.py | 2 +- test_runner/regress/test_next_xid.py | 4 +- test_runner/regress/test_timeline_delete.py | 2 +- 10 files changed, 145 insertions(+), 122 deletions(-) diff --git a/poetry.lock b/poetry.lock index 00fe2505c9..e307b873f3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1758,85 +1758,101 @@ tests = ["pytest (>=4.6)"] [[package]] name = "multidict" -version = "6.0.4" +version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" files = [ - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, - {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, - {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, - {file = 
"multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, - {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, - {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, - {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, - {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, - {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, - {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, - {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, - {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, - {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, - {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = 
"multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"}, + {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"}, + {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"}, + {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"}, + {file = 
"multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"}, + {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"}, + {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"}, + {file = 
"multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"}, + {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"}, + {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = 
"multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, 
+ {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, ] [[package]] @@ -2766,28 +2782,29 @@ six = "*" [[package]] name = "ruff" -version = "0.2.2" +version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, - {file = 
"ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, - {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, - {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, - {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, - {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, + {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, + {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, + {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"}, + {file = 
"ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"}, + {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"}, + {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"}, + {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"}, + {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"}, ] [[package]] @@ -3389,4 +3406,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1" +content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" diff --git a/pyproject.toml b/pyproject.toml index 9cd315bb96..862ed49638 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ kafka-python = "^2.0.2" 
[tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.2.2" +ruff = "^0.7.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 0d3dcd1671..1b2767e296 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc import json import os import re @@ -30,7 +29,8 @@ if TYPE_CHECKING: T = TypeVar("T") -class AbstractNeonCli(abc.ABC): +# Used to be an ABC. abc.ABC removed due to linter without name change. +class AbstractNeonCli: """ A typed wrapper around an arbitrary Neon CLI tool. Supports a way to run arbitrary command directly via CLI. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a313ac2ed3..3cd8019e32 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -386,9 +386,9 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: Optional[ - dict[str, Any] - ] = pageserver_default_tenant_config_compaction_algorithm + self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = ( + pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: log.debug( f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" @@ -1062,9 +1062,9 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config[ - "compaction_algorithm" - ] = config.pageserver_default_tenant_config_compaction_algorithm + tenant_config["compaction_algorithm"] = ( + 
config.pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1108,9 +1108,9 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg[ - "remote_storage" - ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + sk_cfg["remote_storage"] = ( + self.safekeepers_remote_storage.to_toml_inline_table().strip() + ) self.safekeepers.append( Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 76575d330c..7ca6b3dd1c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -417,7 +417,7 @@ def wait_until( time.sleep(interval) continue return res - raise Exception("timed out while waiting for %s" % func) from last_exception + raise Exception(f"timed out while waiting for {func}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index dbf94a2cf5..815d186ab9 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -144,9 +144,10 @@ def test_subscriber_lag( check_pgbench_still_running(pub_workload, "pub") check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) @@ -242,9 +243,10 @@ def test_publisher_restart( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env, ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) 
as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 14b527acca..8b368977df 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -102,10 +102,14 @@ def test_ro_replica_lag( check_pgbench_still_running(master_workload) check_pgbench_still_running(replica_workload) time.sleep(sync_interval_min * 60) - with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( - replica_connstr - ) as conn_replica: - with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + with ( + psycopg2.connect(master_connstr) as conn_master, + psycopg2.connect(replica_connstr) as conn_replica, + ): + with ( + conn_master.cursor() as cur_master, + conn_replica.cursor() as cur_replica, + ): lag = measure_replication_lag(cur_master, cur_replica) log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 04916a6b6f..0134f80769 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -74,7 +74,7 @@ def test_remote_extensions( mimetype="application/octet-stream", headers=[ ("Content-Length", str(file_size)), - ("Content-Disposition", 'attachment; filename="%s"' % file_name), + ("Content-Disposition", f'attachment; filename="{file_name}"'), ], direct_passthrough=True, ) diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 980f6b5694..db8da51125 100644 --- a/test_runner/regress/test_next_xid.py +++ 
b/test_runner/regress/test_next_xid.py @@ -254,13 +254,13 @@ def advance_multixid_to( # missing. That's OK for our purposes. Autovacuum will print some warnings about the # missing segments, but will clean it up by truncating the SLRUs up to the new value, # closing the gap. - segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + segname = f"{MultiXactIdToOffsetSegment(next_multi_xid):04X}" log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) of.flush() - segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + segname = f"{MXOffsetToMemberSegment(next_multi_offset):04X}" log.info(f"Creating dummy segment pg_multixact/members/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 306f22acf9..155709e106 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -649,7 +649,7 @@ def test_timeline_delete_works_for_remote_smoke( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - pg = env.endpoints.create_start("main") + env.endpoints.create_start("main") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From 3532ae76ef3a91131aee1f203a133c4d5e32b57a Mon Sep 17 00:00:00 2001 From: Jere Vaara Date: Fri, 18 Oct 2024 15:07:36 +0300 Subject: [PATCH 52/57] compute_ctl: Add endpoint that allows extensions to be installed (#9344) Adds endpoint to install extensions: **POST** `/extensions` ``` {"extension":"pg_sessions_jwt","database":"neondb","version":"1.0.0"} ``` Will be used by `local-proxy`. 
Example, for the JWT authentication to work the database needs to have the pg_session_jwt extension and also to enable JWT to work in RLS policies. --------- Co-authored-by: Conrad Ludgate --- compute_tools/src/compute.rs | 52 +++++++++++++++++- compute_tools/src/http/api.rs | 37 ++++++++++++- compute_tools/src/http/openapi_spec.yaml | 69 +++++++++++++++++++++++- libs/compute_api/src/requests.rs | 10 +++- libs/compute_api/src/responses.rs | 7 ++- libs/compute_api/src/spec.rs | 3 ++ test_runner/fixtures/endpoint/http.py | 10 ++++ test_runner/regress/test_extensions.py | 50 +++++++++++++++++ 8 files changed, 231 insertions(+), 7 deletions(-) create mode 100644 test_runner/regress/test_extensions.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 11fee73f03..c9dd4dcfc5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -28,7 +28,7 @@ use utils::lsn::Lsn; use compute_api::privilege::Privilege; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; +use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; @@ -1416,6 +1416,56 @@ LIMIT 100", Ok(()) } + pub async fn install_extension( + &self, + ext_name: &PgIdent, + db_name: &PgIdent, + ext_version: ExtVersion, + ) -> Result { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1"; + let version: Option = db_client + .query_opt(version_query, &[&ext_name]) + .await + .with_context(|| format!("Failed to execute query: {}", version_query))? 
+ .map(|row| row.get(0)); + + // sanitize the inputs as postgres idents. + let ext_name: String = ext_name.pg_quote(); + let quoted_version: String = ext_version.pg_quote(); + + if let Some(installed_version) = version { + if installed_version == ext_version { + return Ok(installed_version); + } + let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } else { + let query = + format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } + + Ok(ext_version) + } + #[tokio::main] pub async fn prepare_preload_libraries( &self, diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 133ab9f5af..af35f71bf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,9 +9,10 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use compute_api::requests::{ConfigurationRequest, SetRoleGrantsRequest}; +use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; use compute_api::responses::{ - ComputeStatus, ComputeStatusResponse, GenericAPIError, SetRoleGrantsResponse, + ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, + SetRoleGrantsResponse, }; use anyhow::Result; @@ -100,6 +101,38 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /extensions POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for extensions request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, 
StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + let res = compute + .install_extension(&request.extension, &request.database, request.version) + .await; + match res { + Ok(version) => render_json(Body::from( + serde_json::to_string(&ExtensionInstallResult { + extension: request.extension, + version, + }) + .unwrap(), + )), + Err(e) => { + error!("install_extension failed: {}", e); + render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + (&Method::GET, "/info") => { let num_cpus = num_cpus::get_physical(); info!("serving /info GET request. num_cpus: {}", num_cpus); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 73dbdc3ee9..11eee6ccfd 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -179,6 +179,41 @@ paths: description: Error text or 'true' if check passed. example: "true" + /extensions: + post: + tags: + - Extensions + summary: Install extension if possible. + description: "" + operationId: installExtension + requestBody: + description: Extension name and database to install it to. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallRequest" + responses: + 200: + description: Result from extension installation + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallResult" + 412: + description: | + Compute is in the wrong state for processing the request. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error during extension installation. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /configure: post: tags: @@ -404,7 +439,7 @@ components: moment, when spec was received. 
example: "2022-10-12T07:20:50.52Z" status: - $ref: '#/components/schemas/ComputeStatus' + $ref: "#/components/schemas/ComputeStatus" last_active: type: string description: | @@ -444,6 +479,38 @@ components: - configuration example: running + ExtensionInstallRequest: + type: object + required: + - extension + - database + - version + properties: + extension: + type: string + description: Extension name. + example: "pg_session_jwt" + version: + type: string + description: Version of the extension. + example: "1.0.0" + database: + type: string + description: Database name. + example: "neondb" + + ExtensionInstallResult: + type: object + properties: + extension: + description: Name of the extension. + type: string + example: "pg_session_jwt" + version: + description: Version of the extension. + type: string + example: "1.0.0" + InstalledExtensions: type: object properties: diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index fbc7577dd9..fc3757d981 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,8 +1,7 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
- use crate::{ privilege::Privilege, - spec::{ComputeSpec, PgIdent}, + spec::{ComputeSpec, ExtVersion, PgIdent}, }; use serde::Deserialize; @@ -16,6 +15,13 @@ pub struct ConfigurationRequest { pub spec: ComputeSpec, } +#[derive(Deserialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: PgIdent, + pub database: PgIdent, + pub version: ExtVersion, +} + #[derive(Deserialize, Debug)] pub struct SetRoleGrantsRequest { pub database: PgIdent, diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index fadf524273..79234be720 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::{ privilege::Privilege, - spec::{ComputeSpec, Database, PgIdent, Role}, + spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, }; #[derive(Serialize, Debug, Deserialize)] @@ -172,6 +172,11 @@ pub struct InstalledExtensions { pub extensions: Vec, } +#[derive(Clone, Debug, Default, Serialize)] +pub struct ExtensionInstallResult { + pub extension: PgIdent, + pub version: ExtVersion, +} #[derive(Clone, Debug, Default, Serialize)] pub struct SetRoleGrantsResponse { pub database: PgIdent, diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5903db7055..8a447563dc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -16,6 +16,9 @@ use remote_storage::RemotePath; /// intended to be used for DB / role names. pub type PgIdent = String; +/// String type alias representing Postgres extension version +pub type ExtVersion = String; + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. 
#[derive(Clone, Debug, Default, Deserialize, Serialize)] diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index e7b014b4a9..ea8291c1e0 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -29,6 +29,16 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def extensions(self, extension: str, version: str, database: str): + body = { + "extension": extension, + "version": version, + "database": database, + } + res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res.raise_for_status() + return res.json() + def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): res = self.post( f"http://localhost:{self.port}/grants", diff --git a/test_runner/regress/test_extensions.py b/test_runner/regress/test_extensions.py new file mode 100644 index 0000000000..100fd4b048 --- /dev/null +++ b/test_runner/regress/test_extensions.py @@ -0,0 +1,50 @@ +from logging import info + +from fixtures.neon_fixtures import NeonEnv + + +def test_extensions(neon_simple_env: NeonEnv): + """basic test for the extensions endpoint testing installing extensions""" + + env = neon_simple_env + + env.create_branch("test_extensions") + + endpoint = env.endpoints.create_start("test_extensions") + extension = "neon_test_utils" + database = "test_extensions" + + endpoint.safe_psql("CREATE DATABASE test_extensions") + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" + ) + res = cur.fetchone() + assert res is not None + version = res[0] + + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert not res, "The 'neon_test_utils' extension is installed" + + client = endpoint.http_client() + 
install_res = client.extensions(extension, version, database) + + info("Extension install result: %s", res) + assert install_res["extension"] == extension and install_res["version"] == version + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert res is not None + (db_extension_name, db_extension_version) = res + + assert db_extension_name == extension and db_extension_version == version From fecff15f18f00a692ff234106b064d1693cc5441 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 18 Oct 2024 15:31:50 +0300 Subject: [PATCH 53/57] walproposer: immediately exit if sync-safekeepers collected 0/0. (#9442) Otherwise term history starting with 0/0 is streamed to safekeepers. ref https://github.com/neondatabase/neon/issues/9434 --- pgxn/neon/walproposer.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a3f33cb261..d2a6104c74 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -841,6 +841,23 @@ HandleElectedProposer(WalProposer *wp) wp_log(FATAL, "failed to download WAL for logical replicaiton"); } + /* + * Zero propEpochStartLsn means majority of safekeepers doesn't have any + * WAL, timeline was just created. Compute bumps it to basebackup LSN, + * otherwise we must be sync-safekeepers and we have nothing to do then. + * + * Proceeding is not only pointless but harmful, because we'd give + * safekeepers term history starting with 0/0. These hacks will go away once + * we disable implicit timeline creation on safekeepers and create it with + * non zero LSN from the start. 
+ */ + if (wp->propEpochStartLsn == InvalidXLogRecPtr) + { + Assert(wp->config->syncSafekeepers); + wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting"); + wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); + } + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ From ec6d3422a5a7b6f537b029d7c3e70b7a60f99e0c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 18 Oct 2024 13:38:59 +0100 Subject: [PATCH 54/57] pageserver: disconnect when asking client to reconnect (#9390) ## Problem Consider the following sequence of events: 1. Shard location gets downgraded to secondary while there's a libpq connection in pagestream mode from the compute 2. There's no active tenant, so we return `QueryError::Reconnect` from `PageServerHandler::handle_get_page_at_lsn_request`. 3. Error bubbles up to `PostgresBackendIO::process_message`, bailing us out of pagestream mode. 4. We instruct the client to reconnect, but continue serving the libpq connection. The client isn't yet aware of the request to reconnect and believes it is still in pagestream mode. Pageserver fails to deserialize get page requests wrapped in `CopyData` since it's not in pagestream mode. ## Summary of Changes When we wish to instruct the client to reconnect, also disconnect from the server side after flushing the error.
Closes https://github.com/neondatabase/cloud/issues/17336 --- libs/postgres_backend/src/lib.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 9d274b25e6..7419798a60 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -738,6 +738,20 @@ impl PostgresBackend { QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError) } + err @ QueryError::Reconnect => { + // Instruct the client to reconnect, stop processing messages + // from this libpq connection and, finally, disconnect from the + // server side (returning an Err achieves the latter). + // + // Note the flushing is done by the caller. + let reconnect_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &reconnect_error, + Some(err.pg_error_code()), + ))?; + + return Err(err); + } e => { log_query_error(query_string, &e); let short_error = short_error(&e); From 5cbdec9c794ef414e7511d644450b1a9a944d4ff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 18 Oct 2024 14:41:21 +0100 Subject: [PATCH 55/57] [local_proxy]: install pg_session_jwt extension on demand (#9370) Follow up on #9344. We want to install the extension automatically. We didn't want to couple the extension into compute_ctl so instead local_proxy is the one to issue requests specific to the extension.
depends on #9344 and #9395 --- compute/Dockerfile.compute-node | 4 +- proxy/src/auth/backend/local.rs | 13 ++- proxy/src/bin/local_proxy.rs | 8 +- proxy/src/compute_ctl/mod.rs | 101 ++++++++++++++++++++++++ proxy/src/http/mod.rs | 13 ++- proxy/src/lib.rs | 1 + proxy/src/serverless/backend.rs | 54 +++++++++++-- proxy/src/serverless/local_conn_pool.rs | 57 +++++++++---- 8 files changed, 222 insertions(+), 29 deletions(-) create mode 100644 proxy/src/compute_ctl/mod.rs diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 45c1fd9f38..74970696b5 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -975,8 +975,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ - echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \ + echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index e3995ac6c0..1e029ff609 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,23 +1,32 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; +use tokio::sync::Semaphore; use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; +use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; -use crate::EndpointId; +use crate::url::ApiUrl; +use crate::{http, EndpointId}; pub struct LocalBackend { + pub(crate) initialize: Semaphore, + pub(crate) compute_ctl: ComputeCtlApi, pub(crate) node_info: NodeInfo, } impl LocalBackend { - pub fn new(postgres_addr: SocketAddr) -> Self { + pub fn new(postgres_addr: SocketAddr, compute_ctl: ApiUrl) -> Self { LocalBackend { + initialize: Semaphore::new(1), + compute_ctl: ComputeCtlApi { + api: http::Endpoint::new(compute_ctl, http::new_client()), + }, node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index e6bc369d9a..a16c288e5d 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -25,6 +25,7 @@ use proxy::rate_limiter::{ use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::url::ApiUrl; use proxy::RoleName; project_git_version!(GIT_VERSION); @@ -80,7 +81,10 @@ struct LocalProxyCliArgs { connect_to_compute_retry: String, /// Address of the postgres server #[clap(long, default_value = 
"127.0.0.1:5432")] - compute: SocketAddr, + postgres: SocketAddr, + /// Address of the compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3080/")] + compute_ctl: ApiUrl, /// Path of the local proxy config file #[clap(long, default_value = "./local_proxy.json")] config_path: Utf8PathBuf, @@ -295,7 +299,7 @@ fn build_auth_backend( args: &LocalProxyCliArgs, ) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.compute), + LocalBackend::new(args.postgres, args.compute_ctl.clone()), )); Ok(Box::leak(Box::new(auth_backend))) diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs new file mode 100644 index 0000000000..2b57897223 --- /dev/null +++ b/proxy/src/compute_ctl/mod.rs @@ -0,0 +1,101 @@ +use compute_api::responses::GenericAPIError; +use hyper::{Method, StatusCode}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::url::ApiUrl; +use crate::{http, DbName, RoleName}; + +pub struct ComputeCtlApi { + pub(crate) api: http::Endpoint, +} + +#[derive(Serialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: &'static str, + pub database: DbName, + pub version: &'static str, +} + +#[derive(Serialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: DbName, + pub schema: &'static str, + pub privileges: Vec, + pub role: RoleName, +} + +#[derive(Clone, Debug, Deserialize)] +pub struct ExtensionInstallResponse {} + +#[derive(Clone, Debug, Deserialize)] +pub struct SetRoleGrantsResponse {} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Usage, +} + +#[derive(Error, Debug)] +pub enum ComputeCtlError { + #[error("connection error: {0}")] + ConnectionError(#[source] reqwest_middleware::Error), + #[error("request error [{status}]: {body:?}")] + RequestError { + status: 
StatusCode, + body: Option, + }, + #[error("response parsing error: {0}")] + ResponseError(#[source] reqwest::Error), +} + +impl ComputeCtlApi { + pub async fn install_extension( + &self, + req: &ExtensionInstallRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("extensions"); + }) + .await + } + + pub async fn grant_role( + &self, + req: &SetRoleGrantsRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("grants"); + }) + .await + } + + async fn generic_request( + &self, + req: &Req, + method: Method, + url: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> Result + where + Req: Serialize, + Resp: DeserializeOwned, + { + let resp = self + .api + .request_with_url(method, url) + .json(req) + .send() + .await + .map_err(ComputeCtlError::ConnectionError)?; + + let status = resp.status(); + if status.is_client_error() || status.is_server_error() { + let body = resp.json().await.ok(); + return Err(ComputeCtlError::RequestError { status, body }); + } + + resp.json().await.map_err(ComputeCtlError::ResponseError) + } +} diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index fd587e8f01..f1b632e704 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -8,6 +8,7 @@ use std::time::Duration; use anyhow::bail; use bytes::Bytes; +use http::Method; use http_body_util::BodyExt; use hyper::body::Body; pub(crate) use reqwest::{Request, Response}; @@ -93,9 +94,19 @@ impl Endpoint { /// Return a [builder](RequestBuilder) for a `GET` request, /// accepting a closure to modify the url path segments for more complex paths queries. pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder { + self.request_with_url(Method::GET, f) + } + + /// Return a [builder](RequestBuilder) for a request, + /// accepting a closure to modify the url path segments for more complex paths queries. 
+ pub(crate) fn request_with_url( + &self, + method: Method, + f: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> RequestBuilder { let mut url = self.endpoint.clone(); f(&mut url); - self.client.get(url.into_inner()) + self.client.request(method, url.into_inner()) } /// Execute a [request](reqwest::Request). diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a7b3d45c95..ea17a88067 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -90,6 +90,7 @@ pub mod auth; pub mod cache; pub mod cancellation; pub mod compute; +pub mod compute_ctl; pub mod config; pub mod console_redirect_proxy; pub mod context; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 82e81dbcfe..5d59b4d252 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -14,10 +14,13 @@ use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; use super::http_conn_pool::{self, poll_http2_client, Send}; -use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::compute_ctl::{ + ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, +}; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; @@ -35,6 +38,7 @@ pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, @@ -250,16 +254,47 @@ impl PoolingBackend { return Ok(client); } + let local_backend = match 
&self.auth_backend { + auth::Backend::ControlPlane(_, ()) => { + unreachable!("only local_proxy can connect to local postgres") + } + auth::Backend::Local(local) => local, + }; + + if !self.local_pool.initialized(&conn_info) { + // only install and grant usage one at a time. + let _permit = local_backend.initialize.acquire().await.unwrap(); + + // check again for race + if !self.local_pool.initialized(&conn_info) { + local_backend + .compute_ctl + .install_extension(&ExtensionInstallRequest { + extension: EXT_NAME, + database: conn_info.dbname.clone(), + version: EXT_VERSION, + }) + .await?; + + local_backend + .compute_ctl + .grant_role(&SetRoleGrantsRequest { + schema: EXT_SCHEMA, + privileges: vec![Privilege::Usage], + database: conn_info.dbname.clone(), + role: conn_info.user_info.user.clone(), + }) + .await?; + + self.local_pool.set_initialized(&conn_info); + } + } + let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) => { - unreachable!("only local_proxy can connect to local postgres") - } - auth::Backend::Local(local) => local.node_info.clone(), - }; + let mut node_info = local_backend.node_info.clone(); let (key, jwk) = create_random_jwk(); @@ -324,6 +359,8 @@ pub(crate) enum HttpConnError { #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), + #[error("could not install extension: {0}")] + ComputeCtl(#[from] ComputeCtlError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), #[error("user not authenticated")] @@ -348,6 +385,7 @@ impl ReportableError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, + HttpConnError::ComputeCtl(_) => 
ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), @@ -363,6 +401,7 @@ impl UserFacingError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), HttpConnError::PostgresConnectionError(p) => p.to_string(), HttpConnError::LocalProxyConnectionError(p) => p.to_string(), + HttpConnError::ComputeCtl(_) => "could not set up the JWT authorization database extension".to_string(), HttpConnError::JwtPayloadError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), @@ -379,6 +418,7 @@ impl CouldRetry for HttpConnError { match self { HttpConnError::PostgresConnectionError(e) => e.could_retry(), HttpConnError::LocalProxyConnectionError(e) => e.could_retry(), + HttpConnError::ComputeCtl(_) => false, HttpConnError::ConnectionClosedAbruptly(_) => false, HttpConnError::JwtPayloadError(_) => false, HttpConnError::GetAuthInfo(_) => false, diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index a01afd2820..beb2ad4e8f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,3 +1,14 @@ +//! Manages the pool of connections between local_proxy and postgres. +//! +//! The pool is keyed by database and role_name, and can contain multiple connections +//! shared between users. +//! +//! The pool manages the pg_session_jwt extension used for authorizing +//! requests in the db. +//! +//! The first time a db/role pair is seen, local_proxy attempts to install the extension +//! and grant usage to the role on the given schema. 
+ use std::collections::HashMap; use std::pin::pin; use std::sync::{Arc, Weak}; @@ -27,14 +38,15 @@ use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{DbName, RoleName}; +pub(crate) const EXT_NAME: &str = "pg_session_jwt"; +pub(crate) const EXT_VERSION: &str = "0.1.1"; +pub(crate) const EXT_SCHEMA: &str = "auth"; + struct ConnPoolEntry { conn: ClientInner, _last_access: std::time::Instant, } -// /// key id for the pg_session_jwt state -// static PG_SESSION_JWT_KID: AtomicU64 = AtomicU64::new(1); - // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub(crate) struct EndpointConnPool { @@ -140,11 +152,18 @@ impl Drop for EndpointConnPool { pub(crate) struct DbUserConnPool { conns: Vec>, + + // true if we have definitely installed the extension and + // granted the role access to the auth schema. + initialized: bool, } impl Default for DbUserConnPool { fn default() -> Self { - Self { conns: Vec::new() } + Self { + conns: Vec::new(), + initialized: false, + } } } @@ -199,25 +218,16 @@ impl LocalConnPool { self.config.pool_options.idle_timeout } - // pub(crate) fn shutdown(&self) { - // let mut pool = self.global_pool.write(); - // pool.pools.clear(); - // pool.total_conns = 0; - // } - pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { - let mut client: Option> = None; - if let Some(entry) = self + let client = self .global_pool .write() .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } + .map(|entry| entry.conn); // ok return cached connection if found and establish a new one otherwise if let Some(client) = client { @@ -245,6 +255,23 @@ impl LocalConnPool { } Ok(None) } + + pub(crate) fn initialized(self: &Arc, conn_info: &ConnInfo) -> bool { + self.global_pool + .read() + .pools + .get(&conn_info.db_and_user()) + .map_or(false, |pool| 
pool.initialized) + } + + pub(crate) fn set_initialized(self: &Arc, conn_info: &ConnInfo) { + self.global_pool + .write() + .pools + .entry(conn_info.db_and_user()) + .or_default() + .initialized = true; + } } #[allow(clippy::too_many_arguments)] From e162ab8b536e8b1d2277b4e2c00abd574c394d75 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 18 Oct 2024 15:33:04 +0100 Subject: [PATCH 56/57] storcon: handle ongoing deletions gracefully (#9449) ## Problem Pageserver returns 409 (Conflict) if any of the shards are already deleting the timeline. This resulted in an error being propagated out of the HTTP handler and to the client. It's an expected scenario so we should handle it nicely. This caused failures in `test_storage_controller_smoke` [here](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9435/11390431900/index.html#suites/8fc5d1648d2225380766afde7c428d81/86eee4b002d6572d). ## Summary of Changes Instead of returning an error on 409s, we now bubble the status code up and let the HTTP handler code retry until it gets a 404 or times out. --- storage_controller/src/http.rs | 18 ++++++++++++------ storage_controller/src/service.rs | 29 +++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 46b6f4f2bf..afefe8598c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete( R: std::future::Future> + Send + 'static, F: Fn(Arc) -> R + Send + Sync + 'static, { + // On subsequent retries, wait longer. + // Enable callers with a 25 second request timeout to reliably get a response + const MAX_WAIT: Duration = Duration::from_secs(25); + const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5); + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion // completed. 
let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. - let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); loop { let status = f(service.clone()).await?; @@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete( StatusCode::ACCEPTED => { tracing::info!("Deletion accepted, waiting to try again..."); tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; + retry_period = MAX_RETRY_PERIOD; + } + StatusCode::CONFLICT => { + tracing::info!("Deletion already in progress, waiting to try again..."); + tokio::time::sleep(retry_period).await; } StatusCode::NOT_FOUND => { tracing::info!("Deletion complete"); @@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete( } let now = Instant::now(); - if now + retry_period > started_at + max_wait { + if now + retry_period > started_at + MAX_WAIT { tracing::info!("Deletion timed out waiting for 404"); // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of // the pageserver's swagger definition for this endpoint, and has the same desired diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ab2c3b5e48..01aa8f1dab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3630,14 +3630,21 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client + let res = client .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) - }) + .await; + + match res { + Ok(ok) => Ok(ok), + Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(e) => { + Err( + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on 
{tenant_shard_id} on node {node}: {e}", + )) + ) + } + } } let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); @@ -3652,7 +3659,13 @@ impl Service { }) .await?; - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero. + // We return 409 (Conflict) if deletion was already in progress on any of the shards + // and 202 (Accepted) if deletion was not already in progress on any of the shards. + if statuses.iter().any(|s| s == &StatusCode::CONFLICT) { + return Ok(StatusCode::CONFLICT); + } + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { return Ok(StatusCode::ACCEPTED); } From 62a334871fef32b754ab98a772ebbbbed8d1aa1c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 18 Oct 2024 09:36:29 -0500 Subject: [PATCH 57/57] Take the collector name as argument when generating sql_exporter configs In neon_collector_autoscaling.jsonnet, the collector name is hardcoded to neon_collector_autoscaling. This issue manifests itself such that sql_exporter would not find the collector configuration. 
Signed-off-by: Tristan Partin --- compute/Makefile | 2 ++ compute/etc/sql_exporter.jsonnet | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/compute/Makefile b/compute/Makefile index e4f08a223c..e2896fe390 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -20,12 +20,14 @@ neon_collector_autoscaling.yml: $(jsonnet_files) sql_exporter.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ + --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ + --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ --tla-str application_name=sql_exporter_autoscaling \ etc/sql_exporter.jsonnet diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 640e2ac38d..3c36fd4f68 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -1,4 +1,4 @@ -function(collector_file, application_name='sql_exporter') { +function(collector_name, collector_file, application_name='sql_exporter') { // Configuration for sql_exporter for autoscaling-agent // Global defaults. global: { @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). collectors: [ - 'neon_collector', + collector_name, ], },