From f246aa3ca7f19993e0582dfd8069375c09c5158c Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 19 Aug 2024 10:33:46 +0200
Subject: [PATCH 01/55] proxy: Fix some warnings by extended clippy checks
 (#8748)

* Add missing anonymous lifetimes (`<'_>`); eliding them is now deprecated.
* Match on qualified enum variants; unqualified variants could act like variables.
* Add missing semicolons.
---
 proxy/src/auth.rs                             |  46 +++--
 proxy/src/auth/backend.rs                     |  72 +++-----
 proxy/src/auth/backend/jwt.rs                 |   6 +-
 proxy/src/auth/credentials.rs                 |  17 +-
 proxy/src/cache/common.rs                     |   2 +-
 proxy/src/cache/timed_lru.rs                  |   2 +-
 proxy/src/compute.rs                          |  23 ++-
 proxy/src/config.rs                           |   6 +-
 proxy/src/console/messages.rs                 |  22 ++-
 proxy/src/console/mgmt.rs                     |   3 +-
 proxy/src/console/provider.rs                 | 170 +++++++++---------
 proxy/src/context.rs                          |   8 +-
 proxy/src/context/parquet.rs                  |   2 +-
 proxy/src/intern.rs                           |   2 +-
 proxy/src/metrics.rs                          |   2 +-
 proxy/src/proxy/copy_bidirectional.rs         |   2 +-
 proxy/src/proxy/handshake.rs                  |  15 +-
 proxy/src/proxy/tests/mitm.rs                 |   4 +-
 proxy/src/rate_limiter/limit_algorithm.rs     |   4 +-
 .../src/rate_limiter/limit_algorithm/aimd.rs  |   5 +-
 .../connection_with_credentials_provider.rs   |   2 +-
 proxy/src/redis/notifications.rs              |  23 ++-
 proxy/src/sasl.rs                             |   5 +-
 proxy/src/sasl/channel_binding.rs             |  21 +--
 proxy/src/sasl/messages.rs                    |   5 +-
 proxy/src/scram.rs                            |   4 +-
 proxy/src/scram/countmin.rs                   |   2 -
 proxy/src/scram/exchange.rs                   |  22 +--
 proxy/src/scram/messages.rs                   |   6 +-
 proxy/src/scram/pbkdf2.rs                     |   2 +-
 proxy/src/scram/threadpool.rs                 |   4 +-
 proxy/src/serverless.rs                       |   4 +-
 proxy/src/serverless/conn_pool.rs             |   6 +-
 proxy/src/stream.rs                           |   2 +-
 proxy/src/url.rs                              |   2 +-
 proxy/src/waiters.rs                          |   2 +-
 36 files changed, 246 insertions(+), 279 deletions(-)
diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 8c44823c98..3b3c571129 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -113,38 +113,36 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
 
 impl UserFacingError for AuthError {
     fn to_string_client(&self) -> String {
-        use AuthErrorImpl::*;
         match self.0.as_ref() {
-            Link(e) => e.to_string_client(),
-            GetAuthInfo(e) => e.to_string_client(),
-            Sasl(e) => e.to_string_client(),
-            AuthFailed(_) => self.to_string(),
-            BadAuthMethod(_) => self.to_string(),
-            MalformedPassword(_) => self.to_string(),
-            MissingEndpointName => self.to_string(),
-            Io(_) => "Internal error".to_string(),
-            IpAddressNotAllowed(_) => self.to_string(),
-            TooManyConnections => self.to_string(),
-            UserTimeout(_) => self.to_string(),
+            AuthErrorImpl::Link(e) => e.to_string_client(),
+            AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
+            AuthErrorImpl::Sasl(e) => e.to_string_client(),
+            AuthErrorImpl::AuthFailed(_) => self.to_string(),
+            AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
+            AuthErrorImpl::MalformedPassword(_) => self.to_string(),
+            AuthErrorImpl::MissingEndpointName => self.to_string(),
+            AuthErrorImpl::Io(_) => "Internal error".to_string(),
+            AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
+            AuthErrorImpl::TooManyConnections => self.to_string(),
+            AuthErrorImpl::UserTimeout(_) => self.to_string(),
         }
     }
 }
 
 impl ReportableError for AuthError {
     fn get_error_kind(&self) -> crate::error::ErrorKind {
-        use AuthErrorImpl::*;
         match self.0.as_ref() {
-            Link(e) => e.get_error_kind(),
-            GetAuthInfo(e) => e.get_error_kind(),
-            Sasl(e) => e.get_error_kind(),
-            AuthFailed(_) => crate::error::ErrorKind::User,
-            BadAuthMethod(_) => crate::error::ErrorKind::User,
-            MalformedPassword(_) => crate::error::ErrorKind::User,
-            MissingEndpointName => crate::error::ErrorKind::User,
-            Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
-            TooManyConnections => crate::error::ErrorKind::RateLimit,
-            UserTimeout(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::Link(e) => e.get_error_kind(),
+            AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
+            AuthErrorImpl::Sasl(e) => e.get_error_kind(),
+            AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
+            AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
+            AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
+            AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
         }
     }
 }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index c6a0b2af5a..7592d076ec 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -80,9 +80,8 @@ pub trait TestBackend: Send + Sync + 'static {
 
 impl std::fmt::Display for BackendType<'_, (), ()> {
     fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use BackendType::*;
         match self {
-            Console(api, _) => match &**api {
+            Self::Console(api, _) => match &**api {
                 ConsoleBackend::Console(endpoint) => {
                     fmt.debug_tuple("Console").field(&endpoint.url()).finish()
                 }
@@ -93,7 +92,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                 #[cfg(test)]
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
-            Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
         }
     }
 }
@@ -102,10 +101,9 @@ impl<T, D> BackendType<'_, T, D> {
     /// Very similar to [`std::option::Option::as_ref`].
     /// This helps us pass structured config to async tasks.
     pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
-        use BackendType::*;
         match self {
-            Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
-            Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
+            Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
+            Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
         }
     }
 }
@@ -115,10 +113,9 @@ impl<'a, T, D> BackendType<'a, T, D> {
     /// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
     /// a function to a contained value.
     pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
-        use BackendType::*;
         match self {
-            Console(c, x) => Console(c, f(x)),
-            Link(c, x) => Link(c, x),
+            Self::Console(c, x) => BackendType::Console(c, f(x)),
+            Self::Link(c, x) => BackendType::Link(c, x),
         }
     }
 }
@@ -126,10 +123,9 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
     /// Very similar to [`std::option::Option::transpose`].
     /// This is most useful for error handling.
     pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
-        use BackendType::*;
         match self {
-            Console(c, x) => x.map(|x| Console(c, x)),
-            Link(c, x) => Ok(Link(c, x)),
+            Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
+            Self::Link(c, x) => Ok(BackendType::Link(c, x)),
         }
     }
 }
@@ -293,7 +289,9 @@ async fn auth_quirks(
             ctx.set_endpoint_id(res.info.endpoint.clone());
             let password = match res.keys {
                 ComputeCredentialKeys::Password(p) => p,
-                _ => unreachable!("password hack should return a password"),
+                ComputeCredentialKeys::AuthKeys(_) => {
+                    unreachable!("password hack should return a password")
+                }
             };
             (res.info, Some(password))
         }
@@ -400,21 +398,17 @@ async fn authenticate_with_secret(
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
     /// Get compute endpoint name from the credentials.
     pub fn get_endpoint(&self) -> Option<EndpointId> {
-        use BackendType::*;
-
         match self {
-            Console(_, user_info) => user_info.endpoint_id.clone(),
-            Link(_, _) => Some("link".into()),
+            Self::Console(_, user_info) => user_info.endpoint_id.clone(),
+            Self::Link(_, _) => Some("link".into()),
         }
     }
 
     /// Get username from the credentials.
     pub fn get_user(&self) -> &str {
-        use BackendType::*;
-
         match self {
-            Console(_, user_info) => &user_info.user,
-            Link(_, _) => "link",
+            Self::Console(_, user_info) => &user_info.user,
+            Self::Link(_, _) => "link",
         }
     }
 
@@ -428,10 +422,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         config: &'static AuthenticationConfig,
         endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
-        use BackendType::*;
-
         let res = match self {
-            Console(api, user_info) => {
+            Self::Console(api, user_info) => {
                 info!(
                     user = &*user_info.user,
                     project = user_info.endpoint(),
@@ -451,7 +443,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                 BackendType::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
-            Link(url, _) => {
+            Self::Link(url, _) => {
                 info!("performing link authentication");
 
                 let info = link::authenticate(ctx, &url, client).await?;
@@ -470,10 +462,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        use BackendType::*;
         match self {
-            Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Link(_, _) => Ok(Cached::new_uncached(None)),
+            Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
+            Self::Link(_, _) => Ok(Cached::new_uncached(None)),
         }
     }
 
@@ -481,10 +472,9 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        use BackendType::*;
         match self {
-            Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
         }
     }
 }
@@ -495,18 +485,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        use BackendType::*;
-
         match self {
-            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
         }
     }
 
     fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
         match self {
-            BackendType::Console(_, creds) => Some(&creds.keys),
-            BackendType::Link(_, _) => None,
+            Self::Console(_, creds) => Some(&creds.keys),
+            Self::Link(_, _) => None,
         }
     }
 }
@@ -517,18 +505,16 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
         &self,
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        use BackendType::*;
-
         match self {
-            Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
+            Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
         }
     }
 
     fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
         match self {
-            BackendType::Console(_, creds) => Some(&creds.keys),
-            BackendType::Link(_, _) => None,
+            Self::Console(_, creds) => Some(&creds.keys),
+            Self::Link(_, _) => None,
         }
     }
 }
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 0c2ca8fb97..e021a7e23f 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -195,7 +195,7 @@ impl JwkCacheEntryLock {
 
         let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JWTHeader>(&header)
+        let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
@@ -340,7 +340,7 @@ impl JwkRenewalPermit<'_> {
         }
     }
 
-    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
+    async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
         match from.lookup.acquire().await {
             Ok(permit) => {
                 permit.forget();
@@ -352,7 +352,7 @@ impl JwkRenewalPermit<'_> {
         }
     }
 
-    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
+    fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
         match from.lookup.try_acquire() {
             Ok(permit) => {
                 permit.forget();
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 8f4a392131..849e7d65e8 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -89,10 +89,12 @@ impl ComputeUserInfoMaybeEndpoint {
         sni: Option<&str>,
         common_names: Option<&HashSet<String>>,
     ) -> Result<Self, ComputeUserInfoParseError> {
-        use ComputeUserInfoParseError::*;
-
         // Some parameters are stored in the startup message.
-        let get_param = |key| params.get(key).ok_or(MissingKey(key));
+        let get_param = |key| {
+            params
+                .get(key)
+                .ok_or(ComputeUserInfoParseError::MissingKey(key))
+        };
         let user: RoleName = get_param("user")?.into();
 
         // Project name might be passed via PG's command-line options.
@@ -122,11 +124,14 @@ impl ComputeUserInfoMaybeEndpoint {
         let endpoint = match (endpoint_option, endpoint_from_domain) {
             // Invariant: if we have both project name variants, they should match.
             (Some(option), Some(domain)) if option != domain => {
-                Some(Err(InconsistentProjectNames { domain, option }))
+                Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
+                    domain,
+                    option,
+                }))
             }
             // Invariant: project name may not contain certain characters.
             (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
-                false => Err(MalformedProjectName(name)),
+                false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
                 true => Ok(name),
             }),
         }
@@ -186,7 +191,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
         impl<'de> serde::de::Visitor<'de> for StrVisitor {
             type Value = IpPattern;
 
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                 write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
             }
 
diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs
index 4e393fddb2..82c78e3eb2 100644
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -24,7 +24,7 @@ impl<C: Cache> Cache for &C {
     type LookupInfo<Key> = C::LookupInfo<Key>;
 
     fn invalidate(&self, info: &Self::LookupInfo<Self::Key>) {
-        C::invalidate(self, info)
+        C::invalidate(self, info);
     }
 }
 
diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index c5c4f6a1ed..07fad56643 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -58,7 +58,7 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
     type LookupInfo<Key> = LookupInfo<Key>;
 
     fn invalidate(&self, info: &Self::LookupInfo<K>) {
-        self.invalidate_raw(info)
+        self.invalidate_raw(info);
     }
 }
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 18c82fe379..c071a59d58 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -44,11 +44,10 @@ pub enum ConnectionError {
 
 impl UserFacingError for ConnectionError {
     fn to_string_client(&self) -> String {
-        use ConnectionError::*;
         match self {
             // This helps us drop irrelevant library-specific prefixes.
             // TODO: propagate severity level and other parameters.
-            Postgres(err) => match err.as_db_error() {
+            ConnectionError::Postgres(err) => match err.as_db_error() {
                 Some(err) => {
                     let msg = err.message();
 
@@ -62,8 +61,8 @@ impl UserFacingError for ConnectionError {
                 }
                 None => err.to_string(),
             },
-            WakeComputeError(err) => err.to_string_client(),
-            TooManyConnectionAttempts(_) => {
+            ConnectionError::WakeComputeError(err) => err.to_string_client(),
+            ConnectionError::TooManyConnectionAttempts(_) => {
                 "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
             }
             _ => COULD_NOT_CONNECT.to_owned(),
@@ -366,16 +365,16 @@ static TLS_ROOTS: OnceCell<Arc<rustls::RootCertStore>> = OnceCell::new();
 struct AcceptEverythingVerifier;
 impl ServerCertVerifier for AcceptEverythingVerifier {
     fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
-        use rustls::SignatureScheme::*;
+        use rustls::SignatureScheme;
         // The schemes for which `SignatureScheme::supported_in_tls13` returns true.
         vec![
-            ECDSA_NISTP521_SHA512,
-            ECDSA_NISTP384_SHA384,
-            ECDSA_NISTP256_SHA256,
-            RSA_PSS_SHA512,
-            RSA_PSS_SHA384,
-            RSA_PSS_SHA256,
-            ED25519,
+            SignatureScheme::ECDSA_NISTP521_SHA512,
+            SignatureScheme::ECDSA_NISTP384_SHA384,
+            SignatureScheme::ECDSA_NISTP256_SHA256,
+            SignatureScheme::RSA_PSS_SHA512,
+            SignatureScheme::RSA_PSS_SHA384,
+            SignatureScheme::RSA_PSS_SHA256,
+            SignatureScheme::ED25519,
         ]
     }
     fn verify_server_cert(
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 1412095505..36d04924f2 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -155,7 +155,7 @@ pub enum TlsServerEndPoint {
 }
 
 impl TlsServerEndPoint {
-    pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
+    pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result<Self> {
         let sha256_oids = [
             // I'm explicitly not adding MD5 or SHA1 here... They're bad.
             oid_registry::OID_SIG_ECDSA_WITH_SHA256,
@@ -278,7 +278,7 @@ impl CertResolver {
 impl rustls::server::ResolvesServerCert for CertResolver {
     fn resolve(
         &self,
-        client_hello: rustls::server::ClientHello,
+        client_hello: rustls::server::ClientHello<'_>,
     ) -> Option<Arc<rustls::sign::CertifiedKey>> {
         self.resolve(client_hello.server_name()).map(|x| x.0)
     }
@@ -559,7 +559,7 @@ impl RetryConfig {
             match key {
                 "num_retries" => num_retries = Some(value.parse()?),
                 "base_retry_wait_duration" => {
-                    base_retry_wait_duration = Some(humantime::parse_duration(value)?)
+                    base_retry_wait_duration = Some(humantime::parse_duration(value)?);
                 }
                 "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
                 unknown => bail!("unknown key: {unknown}"),
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index 9abf24ab7f..ac66e116d0 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -22,16 +22,15 @@ impl ConsoleError {
         self.status
             .as_ref()
             .and_then(|s| s.details.error_info.as_ref())
-            .map(|e| e.reason)
-            .unwrap_or(Reason::Unknown)
+            .map_or(Reason::Unknown, |e| e.reason)
     }
+
     pub fn get_user_facing_message(&self) -> String {
         use super::provider::errors::REQUEST_FAILED;
         self.status
             .as_ref()
             .and_then(|s| s.details.user_facing_message.as_ref())
-            .map(|m| m.message.clone().into())
-            .unwrap_or_else(|| {
+            .map_or_else(|| {
                 // Ask @neondatabase/control-plane for review before adding more.
                 match self.http_status_code {
                     http::StatusCode::NOT_FOUND => {
@@ -48,19 +47,18 @@ impl ConsoleError {
                     }
                     _ => REQUEST_FAILED.to_owned(),
                 }
-            })
+            }, |m| m.message.clone().into())
     }
 }
 
 impl Display for ConsoleError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let msg = self
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let msg: &str = self
             .status
             .as_ref()
             .and_then(|s| s.details.user_facing_message.as_ref())
-            .map(|m| m.message.as_ref())
-            .unwrap_or_else(|| &self.error);
-        write!(f, "{}", msg)
+            .map_or_else(|| self.error.as_ref(), |m| m.message.as_ref());
+        write!(f, "{msg}")
     }
 }
 
@@ -286,7 +284,7 @@ pub struct DatabaseInfo {
 
 // Manually implement debug to omit sensitive info.
 impl fmt::Debug for DatabaseInfo {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("DatabaseInfo")
             .field("host", &self.host)
             .field("port", &self.port)
@@ -373,7 +371,7 @@ mod tests {
                 }
             }
         });
-        let _: KickSession = serde_json::from_str(&json.to_string())?;
+        let _: KickSession<'_> = serde_json::from_str(&json.to_string())?;
 
         Ok(())
     }
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index befe7d7510..82d5033aab 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -93,7 +93,8 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
 }
 
 fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
-    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
+    let resp: KickSession<'_> =
+        serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);
     let _enter = span.enter();
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 15fc0134b3..cc2ee10062 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -26,7 +26,7 @@ use tracing::info;
 pub mod errors {
     use crate::{
         console::messages::{self, ConsoleError, Reason},
-        error::{io_error, ReportableError, UserFacingError},
+        error::{io_error, ErrorKind, ReportableError, UserFacingError},
         proxy::retry::CouldRetry,
     };
     use thiserror::Error;
@@ -51,21 +51,19 @@ pub mod errors {
     impl ApiError {
         /// Returns HTTP status code if it's the reason for failure.
         pub fn get_reason(&self) -> messages::Reason {
-            use ApiError::*;
             match self {
-                Console(e) => e.get_reason(),
-                _ => messages::Reason::Unknown,
+                ApiError::Console(e) => e.get_reason(),
+                ApiError::Transport(_) => messages::Reason::Unknown,
             }
         }
     }
 
     impl UserFacingError for ApiError {
         fn to_string_client(&self) -> String {
-            use ApiError::*;
             match self {
                 // To minimize risks, only select errors are forwarded to users.
-                Console(c) => c.get_user_facing_message(),
-                _ => REQUEST_FAILED.to_owned(),
+                ApiError::Console(c) => c.get_user_facing_message(),
+                ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
             }
         }
     }
@@ -73,57 +71,53 @@ pub mod errors {
     impl ReportableError for ApiError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
-                ApiError::Console(e) => {
-                    use crate::error::ErrorKind::*;
-                    match e.get_reason() {
-                        Reason::RoleProtected => User,
-                        Reason::ResourceNotFound => User,
-                        Reason::ProjectNotFound => User,
-                        Reason::EndpointNotFound => User,
-                        Reason::BranchNotFound => User,
-                        Reason::RateLimitExceeded => ServiceRateLimit,
-                        Reason::NonDefaultBranchComputeTimeExceeded => User,
-                        Reason::ActiveTimeQuotaExceeded => User,
-                        Reason::ComputeTimeQuotaExceeded => User,
-                        Reason::WrittenDataQuotaExceeded => User,
-                        Reason::DataTransferQuotaExceeded => User,
-                        Reason::LogicalSizeQuotaExceeded => User,
-                        Reason::ConcurrencyLimitReached => ControlPlane,
-                        Reason::LockAlreadyTaken => ControlPlane,
-                        Reason::RunningOperations => ControlPlane,
-                        Reason::Unknown => match &e {
-                            ConsoleError {
-                                http_status_code:
-                                    http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
-                                ..
-                            } => crate::error::ErrorKind::User,
-                            ConsoleError {
-                                http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
-                                error,
-                                ..
-                            } if error.contains(
-                                "compute time quota of non-primary branches is exceeded",
-                            ) =>
-                            {
-                                crate::error::ErrorKind::User
-                            }
-                            ConsoleError {
-                                http_status_code: http::StatusCode::LOCKED,
-                                error,
-                                ..
-                            } if error.contains("quota exceeded")
-                                || error.contains("the limit for current plan reached") =>
-                            {
-                                crate::error::ErrorKind::User
-                            }
-                            ConsoleError {
-                                http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
-                                ..
-                            } => crate::error::ErrorKind::ServiceRateLimit,
-                            ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
-                        },
-                    }
-                }
+                ApiError::Console(e) => match e.get_reason() {
+                    Reason::RoleProtected => ErrorKind::User,
+                    Reason::ResourceNotFound => ErrorKind::User,
+                    Reason::ProjectNotFound => ErrorKind::User,
+                    Reason::EndpointNotFound => ErrorKind::User,
+                    Reason::BranchNotFound => ErrorKind::User,
+                    Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
+                    Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User,
+                    Reason::ActiveTimeQuotaExceeded => ErrorKind::User,
+                    Reason::ComputeTimeQuotaExceeded => ErrorKind::User,
+                    Reason::WrittenDataQuotaExceeded => ErrorKind::User,
+                    Reason::DataTransferQuotaExceeded => ErrorKind::User,
+                    Reason::LogicalSizeQuotaExceeded => ErrorKind::User,
+                    Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
+                    Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
+                    Reason::RunningOperations => ErrorKind::ControlPlane,
+                    Reason::Unknown => match &e {
+                        ConsoleError {
+                            http_status_code:
+                                http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
+                            ..
+                        } => crate::error::ErrorKind::User,
+                        ConsoleError {
+                            http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
+                            error,
+                            ..
+                        } if error
+                            .contains("compute time quota of non-primary branches is exceeded") =>
+                        {
+                            crate::error::ErrorKind::User
+                        }
+                        ConsoleError {
+                            http_status_code: http::StatusCode::LOCKED,
+                            error,
+                            ..
+                        } if error.contains("quota exceeded")
+                            || error.contains("the limit for current plan reached") =>
+                        {
+                            crate::error::ErrorKind::User
+                        }
+                        ConsoleError {
+                            http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
+                            ..
+                        } => crate::error::ErrorKind::ServiceRateLimit,
+                        ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
+                    },
+                },
                 ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
             }
         }
@@ -170,12 +164,11 @@ pub mod errors {
 
     impl UserFacingError for GetAuthInfoError {
         fn to_string_client(&self) -> String {
-            use GetAuthInfoError::*;
             match self {
                 // We absolutely should not leak any secrets!
-                BadSecret => REQUEST_FAILED.to_owned(),
+                Self::BadSecret => REQUEST_FAILED.to_owned(),
                 // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),
             }
         }
     }
@@ -183,8 +176,8 @@ pub mod errors {
     impl ReportableError for GetAuthInfoError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
-                GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane,
-                GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane,
+                Self::BadSecret => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
             }
         }
     }
@@ -213,17 +206,16 @@ pub mod errors {
 
     impl UserFacingError for WakeComputeError {
         fn to_string_client(&self) -> String {
-            use WakeComputeError::*;
             match self {
                 // We shouldn't show user the address even if it's broken.
                 // Besides, user is unlikely to care about this detail.
-                BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
+                Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
                 // However, API might return a meaningful error.
-                ApiError(e) => e.to_string_client(),
+                Self::ApiError(e) => e.to_string_client(),
 
-                TooManyConnections => self.to_string(),
+                Self::TooManyConnections => self.to_string(),
 
-                TooManyConnectionAttempts(_) => {
+                Self::TooManyConnectionAttempts(_) => {
                     "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
                 }
             }
@@ -233,10 +225,10 @@ pub mod errors {
     impl ReportableError for WakeComputeError {
         fn get_error_kind(&self) -> crate::error::ErrorKind {
             match self {
-                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
-                WakeComputeError::ApiError(e) => e.get_error_kind(),
-                WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
-                WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
+                Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
+                Self::ApiError(e) => e.get_error_kind(),
+                Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
+                Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
             }
         }
     }
@@ -244,10 +236,10 @@ pub mod errors {
     impl CouldRetry for WakeComputeError {
         fn could_retry(&self) -> bool {
             match self {
-                WakeComputeError::BadComputeAddress(_) => false,
-                WakeComputeError::ApiError(e) => e.could_retry(),
-                WakeComputeError::TooManyConnections => false,
-                WakeComputeError::TooManyConnectionAttempts(_) => false,
+                Self::BadComputeAddress(_) => false,
+                Self::ApiError(e) => e.could_retry(),
+                Self::TooManyConnections => false,
+                Self::TooManyConnectionAttempts(_) => false,
             }
         }
     }
@@ -366,13 +358,14 @@ impl Api for ConsoleBackend {
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
-        use ConsoleBackend::*;
         match self {
-            Console(api) => api.get_role_secret(ctx, user_info).await,
+            Self::Console(api) => api.get_role_secret(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.get_role_secret(ctx, user_info).await,
+            Self::Postgres(api) => api.get_role_secret(ctx, user_info).await,
             #[cfg(test)]
-            Test(_) => unreachable!("this function should never be called in the test backend"),
+            Self::Test(_) => {
+                unreachable!("this function should never be called in the test backend")
+            }
         }
     }
 
@@ -381,13 +374,12 @@ impl Api for ConsoleBackend {
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
-        use ConsoleBackend::*;
         match self {
-            Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             #[cfg(test)]
-            Test(api) => api.get_allowed_ips_and_secret(),
+            Self::Test(api) => api.get_allowed_ips_and_secret(),
         }
     }
 
@@ -396,14 +388,12 @@ impl Api for ConsoleBackend {
         ctx: &RequestMonitoring,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
-        use ConsoleBackend::*;
-
         match self {
-            Console(api) => api.wake_compute(ctx, user_info).await,
+            Self::Console(api) => api.wake_compute(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
-            Postgres(api) => api.wake_compute(ctx, user_info).await,
+            Self::Postgres(api) => api.wake_compute(ctx, user_info).await,
             #[cfg(test)]
-            Test(api) => api.wake_compute(),
+            Self::Test(api) => api.wake_compute(),
         }
     }
 }
@@ -549,7 +539,7 @@ impl WakeComputePermit {
         !self.permit.is_disabled()
     }
     pub fn release(self, outcome: Outcome) {
-        self.permit.release(outcome)
+        self.permit.release(outcome);
     }
     pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
         match res {
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index e925f67233..cafbdedc15 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -166,7 +166,7 @@ impl RequestMonitoring {
     pub fn set_project(&self, x: MetricsAuxInfo) {
         let mut this = self.0.try_lock().expect("should not deadlock");
         if this.endpoint_id.is_none() {
-            this.set_endpoint_id(x.endpoint_id.as_str().into())
+            this.set_endpoint_id(x.endpoint_id.as_str().into());
         }
         this.branch = Some(x.branch_id);
         this.project = Some(x.project_id);
@@ -260,7 +260,7 @@ impl RequestMonitoring {
             .cold_start_info
     }
 
-    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
+    pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
         LatencyTimerPause {
             ctx: self,
             start: tokio::time::Instant::now(),
@@ -273,7 +273,7 @@ impl RequestMonitoring {
             .try_lock()
             .expect("should not deadlock")
             .latency_timer
-            .success()
+            .success();
     }
 }
 
@@ -328,7 +328,7 @@ impl RequestMonitoringInner {
     fn has_private_peer_addr(&self) -> bool {
         match self.peer_addr {
             IpAddr::V4(ip) => ip.is_private(),
-            _ => false,
+            IpAddr::V6(_) => false,
         }
     }
 
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index bb02a476fc..e5962b35fa 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -736,7 +736,7 @@ mod tests {
                 while let Some(r) = s.next().await {
                     tx.send(r).unwrap();
                 }
-                time::sleep(time::Duration::from_secs(70)).await
+                time::sleep(time::Duration::from_secs(70)).await;
             }
         });
 
diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs
index e38135dd22..d418caa511 100644
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -56,7 +56,7 @@ impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
         impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
             type Value = InternedString<Id>;
 
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                 formatter.write_str("a string")
             }
 
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 0167553e30..ccef88231b 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -252,7 +252,7 @@ impl Drop for HttpEndpointPoolsGuard<'_> {
 }
 
 impl HttpEndpointPools {
-    pub fn guard(&self) -> HttpEndpointPoolsGuard {
+    pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> {
         self.http_pool_endpoints_registered_total.inc();
         HttpEndpointPoolsGuard {
             dec: &self.http_pool_endpoints_unregistered_total,
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index 3c45fff969..048523f69c 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -184,7 +184,7 @@ impl CopyBuffer {
                 }
                 Poll::Pending
             }
-            res => res.map_err(ErrorDirection::Write),
+            res @ Poll::Ready(_) => res.map_err(ErrorDirection::Write),
         }
     }
 
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index c65a5558d9..27a72f8072 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -82,9 +82,8 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
     let mut stream = PqStream::new(Stream::from_raw(stream));
     loop {
         let msg = stream.read_startup_packet().await?;
-        use FeStartupPacket::*;
         match msg {
-            SslRequest { direct } => match stream.get_ref() {
+            FeStartupPacket::SslRequest { direct } => match stream.get_ref() {
                 Stream::Raw { .. } if !tried_ssl => {
                     tried_ssl = true;
 
@@ -139,7 +138,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
 
                         let tls_stream = accept.await.inspect_err(|_| {
                             if record_handshake_error {
-                                Metrics::get().proxy.tls_handshake_failures.inc()
+                                Metrics::get().proxy.tls_handshake_failures.inc();
                             }
                         })?;
 
@@ -182,7 +181,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 }
                 _ => return Err(HandshakeError::ProtocolViolation),
             },
-            GssEncRequest => match stream.get_ref() {
+            FeStartupPacket::GssEncRequest => match stream.get_ref() {
                 Stream::Raw { .. } if !tried_gss => {
                     tried_gss = true;
 
@@ -191,7 +190,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 }
                 _ => return Err(HandshakeError::ProtocolViolation),
             },
-            StartupMessage { params, version }
+            FeStartupPacket::StartupMessage { params, version }
                 if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
             {
                 // Check that the config has been consumed during upgrade
@@ -211,7 +210,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 break Ok(HandshakeData::Startup(stream, params));
             }
             // downgrade protocol version
-            StartupMessage { params, version }
+            FeStartupPacket::StartupMessage { params, version }
                 if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
             {
                 warn!(?version, "unsupported minor version");
@@ -241,7 +240,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 );
                 break Ok(HandshakeData::Startup(stream, params));
             }
-            StartupMessage { version, .. } => {
+            FeStartupPacket::StartupMessage { version, .. } => {
                 warn!(
                     ?version,
                     session_type = "normal",
@@ -249,7 +248,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                 );
                 return Err(HandshakeError::ProtocolViolation);
             }
-            CancelRequest(cancel_key_data) => {
+            FeStartupPacket::CancelRequest(cancel_key_data) => {
                 info!(session_type = "cancellation", "successful handshake");
                 break Ok(HandshakeData::Cancel(cancel_key_data));
             }
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index c8ec2b2db6..2d752b9183 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -68,7 +68,7 @@ async fn proxy_mitm(
                                 end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap();
                                 continue;
                             }
-                            end_client.send(message).await.unwrap()
+                            end_client.send(message).await.unwrap();
                         }
                         _ => break,
                     }
@@ -88,7 +88,7 @@ async fn proxy_mitm(
                                 end_server.send(buf.freeze()).await.unwrap();
                                 continue;
                             }
-                            end_server.send(message).await.unwrap()
+                            end_server.send(message).await.unwrap();
                         }
                         _ => break,
                     }
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index 3842ce269e..80a62b2a76 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -237,7 +237,7 @@ impl Token {
     }
 
     pub fn release(mut self, outcome: Outcome) {
-        self.release_mut(Some(outcome))
+        self.release_mut(Some(outcome));
     }
 
     pub fn release_mut(&mut self, outcome: Option<Outcome>) {
@@ -249,7 +249,7 @@ impl Token {
 
 impl Drop for Token {
     fn drop(&mut self) {
-        self.release_mut(None)
+        self.release_mut(None);
     }
 }
 
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
index b39740bb21..d669492fa6 100644
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -25,9 +25,8 @@ pub struct Aimd {
 
 impl LimitAlgorithm for Aimd {
     fn update(&self, old_limit: usize, sample: Sample) -> usize {
-        use Outcome::*;
         match sample.outcome {
-            Success => {
+            Outcome::Success => {
                 let utilisation = sample.in_flight as f32 / old_limit as f32;
 
                 if utilisation > self.utilisation {
@@ -42,7 +41,7 @@ impl LimitAlgorithm for Aimd {
                     old_limit
                 }
             }
-            Overload => {
+            Outcome::Overload => {
                 let limit = old_limit as f32 * self.dec;
 
                 // Floor instead of round, so the limit reduces even with small numbers.
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index b02ce472c0..c78ee166f1 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -98,7 +98,7 @@ impl ConnectionWithCredentialsProvider {
         info!("Establishing a new connection...");
         self.con = None;
         if let Some(f) = self.refresh_token_task.take() {
-            f.abort()
+            f.abort();
         }
         let mut con = self
             .get_client()
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index efd7437d5d..ad69246443 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -108,7 +108,6 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
     }
     #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))]
     async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> {
-        use Notification::*;
         let payload: String = msg.get_payload()?;
         tracing::debug!(?payload, "received a message payload");
 
@@ -124,7 +123,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
         };
         tracing::debug!(?msg, "received a message");
         match msg {
-            Cancel(cancel_session) => {
+            Notification::Cancel(cancel_session) => {
                 tracing::Span::current().record(
                     "session_id",
                     tracing::field::display(cancel_session.session_id),
@@ -153,12 +152,12 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
             }
             _ => {
                 invalidate_cache(self.cache.clone(), msg.clone());
-                if matches!(msg, AllowedIpsUpdate { .. }) {
+                if matches!(msg, Notification::AllowedIpsUpdate { .. }) {
                     Metrics::get()
                         .proxy
                         .redis_events_count
                         .inc(RedisEventsCount::AllowedIpsUpdate);
-                } else if matches!(msg, PasswordUpdate { .. }) {
+                } else if matches!(msg, Notification::PasswordUpdate { .. }) {
                     Metrics::get()
                         .proxy
                         .redis_events_count
@@ -180,16 +179,16 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
 }
 
 fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
-    use Notification::*;
     match msg {
-        AllowedIpsUpdate { allowed_ips_update } => {
-            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
+        Notification::AllowedIpsUpdate { allowed_ips_update } => {
+            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
         }
-        PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
-            password_update.project_id,
-            password_update.role_name,
-        ),
-        Cancel(_) => unreachable!("cancel message should be handled separately"),
+        Notification::PasswordUpdate { password_update } => cache
+            .invalidate_role_secret_for_project(
+                password_update.project_id,
+                password_update.role_name,
+            ),
+        Notification::Cancel(_) => unreachable!("cancel message should be handled separately"),
     }
 }
 
diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs
index 0811416ca2..60207fc824 100644
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -42,10 +42,9 @@ pub enum Error {
 
 impl UserFacingError for Error {
     fn to_string_client(&self) -> String {
-        use Error::*;
         match self {
-            ChannelBindingFailed(m) => m.to_string(),
-            ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
+            Self::ChannelBindingFailed(m) => (*m).to_string(),
+            Self::ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
             _ => "authentication protocol violation".to_string(),
         }
     }
diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs
index 13d681de6d..6e2d3057ce 100644
--- a/proxy/src/sasl/channel_binding.rs
+++ b/proxy/src/sasl/channel_binding.rs
@@ -13,11 +13,10 @@ pub enum ChannelBinding<T> {
 
 impl<T> ChannelBinding<T> {
     pub fn and_then<R, E>(self, f: impl FnOnce(T) -> Result<R, E>) -> Result<ChannelBinding<R>, E> {
-        use ChannelBinding::*;
         Ok(match self {
-            NotSupportedClient => NotSupportedClient,
-            NotSupportedServer => NotSupportedServer,
-            Required(x) => Required(f(x)?),
+            Self::NotSupportedClient => ChannelBinding::NotSupportedClient,
+            Self::NotSupportedServer => ChannelBinding::NotSupportedServer,
+            Self::Required(x) => ChannelBinding::Required(f(x)?),
         })
     }
 }
@@ -25,11 +24,10 @@ impl<T> ChannelBinding<T> {
 impl<'a> ChannelBinding<&'a str> {
     // NB: FromStr doesn't work with lifetimes
     pub fn parse(input: &'a str) -> Option<Self> {
-        use ChannelBinding::*;
         Some(match input {
-            "n" => NotSupportedClient,
-            "y" => NotSupportedServer,
-            other => Required(other.strip_prefix("p=")?),
+            "n" => Self::NotSupportedClient,
+            "y" => Self::NotSupportedServer,
+            other => Self::Required(other.strip_prefix("p=")?),
         })
     }
 }
@@ -40,17 +38,16 @@ impl<T: std::fmt::Display> ChannelBinding<T> {
         &self,
         get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>,
     ) -> Result<std::borrow::Cow<'static, str>, E> {
-        use ChannelBinding::*;
         Ok(match self {
-            NotSupportedClient => {
+            Self::NotSupportedClient => {
                 // base64::encode("n,,")
                 "biws".into()
             }
-            NotSupportedServer => {
+            Self::NotSupportedServer => {
                 // base64::encode("y,,")
                 "eSws".into()
             }
-            Required(mode) => {
+            Self::Required(mode) => {
                 use std::io::Write;
                 let mut cbind_input = vec![];
                 write!(&mut cbind_input, "p={mode},,",).unwrap();
diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs
index b9208f6f1f..2b5ae1785d 100644
--- a/proxy/src/sasl/messages.rs
+++ b/proxy/src/sasl/messages.rs
@@ -42,10 +42,9 @@ pub(super) enum ServerMessage<T> {
 
 impl<'a> ServerMessage<&'a str> {
     pub(super) fn to_reply(&self) -> BeMessage<'a> {
-        use BeAuthenticationSaslMessage::*;
         BeMessage::AuthenticationSasl(match self {
-            ServerMessage::Continue(s) => Continue(s.as_bytes()),
-            ServerMessage::Final(s) => Final(s.as_bytes()),
+            ServerMessage::Continue(s) => BeAuthenticationSaslMessage::Continue(s.as_bytes()),
+            ServerMessage::Final(s) => BeAuthenticationSaslMessage::Final(s.as_bytes()),
         })
     }
 }
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index 862facb4e5..145e727a74 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -137,12 +137,12 @@ mod tests {
 
     #[tokio::test]
     async fn round_trip() {
-        run_round_trip_test("pencil", "pencil").await
+        run_round_trip_test("pencil", "pencil").await;
     }
 
     #[tokio::test]
     #[should_panic(expected = "password doesn't match")]
     async fn failure() {
-        run_round_trip_test("pencil", "eraser").await
+        run_round_trip_test("pencil", "eraser").await;
     }
 }
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs
index e8e7ef5c86..944bb3c83e 100644
--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -98,8 +98,6 @@ mod tests {
         // q% of counts will be within p of the actual value
         let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
 
-        dbg!(sketch.buckets.len());
-
         // insert a bunch of entries in a random order
         let mut ids2 = ids.clone();
         while !ids2.is_empty() {
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index d0adbc780e..f2494379a5 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -210,23 +210,23 @@ impl sasl::Mechanism for Exchange<'_> {
     type Output = super::ScramKey;
 
     fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
-        use {sasl::Step::*, ExchangeState::*};
+        use {sasl::Step, ExchangeState};
         match &self.state {
-            Initial(init) => {
+            ExchangeState::Initial(init) => {
                 match init.transition(self.secret, &self.tls_server_end_point, input)? {
-                    Continue(sent, msg) => {
-                        self.state = SaltSent(sent);
-                        Ok(Continue(self, msg))
+                    Step::Continue(sent, msg) => {
+                        self.state = ExchangeState::SaltSent(sent);
+                        Ok(Step::Continue(self, msg))
                     }
-                    Success(x, _) => match x {},
-                    Failure(msg) => Ok(Failure(msg)),
+                    Step::Success(x, _) => match x {},
+                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
             }
-            SaltSent(sent) => {
+            ExchangeState::SaltSent(sent) => {
                 match sent.transition(self.secret, &self.tls_server_end_point, input)? {
-                    Success(keys, msg) => Ok(Success(keys, msg)),
-                    Continue(x, _) => match x {},
-                    Failure(msg) => Ok(Failure(msg)),
+                    Step::Success(keys, msg) => Ok(Step::Success(keys, msg)),
+                    Step::Continue(x, _) => match x {},
+                    Step::Failure(msg) => Ok(Step::Failure(msg)),
                 }
             }
         }
diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs
index cf677a3334..5ecbbf7004 100644
--- a/proxy/src/scram/messages.rs
+++ b/proxy/src/scram/messages.rs
@@ -59,7 +59,7 @@ impl<'a> ClientFirstMessage<'a> {
 
         // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14
         if !username.is_empty() {
-            tracing::warn!(username, "scram username provided, but is not expected")
+            tracing::warn!(username, "scram username provided, but is not expected");
             // TODO(conrad):
             // return None;
         }
@@ -137,7 +137,7 @@ impl<'a> ClientFinalMessage<'a> {
     /// Build a response to [`ClientFinalMessage`].
     pub fn build_server_final_message(
         &self,
-        signature_builder: SignatureBuilder,
+        signature_builder: SignatureBuilder<'_>,
         server_key: &ScramKey,
     ) -> String {
         let mut buf = String::from("v=");
@@ -212,7 +212,7 @@ mod tests {
 
     #[test]
     fn parse_client_first_message_with_invalid_gs2_authz() {
-        assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none())
+        assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none());
     }
 
     #[test]
diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs
index a803ba7e1b..f690cc7738 100644
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -84,6 +84,6 @@ mod tests {
         };
 
         let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
-        assert_eq!(hash, expected)
+        assert_eq!(hash, expected);
     }
 }
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
index 7701b869a3..fa3d3ccca2 100644
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -270,7 +270,7 @@ fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
                         .inc(ThreadPoolWorkerId(index));
 
                     // skip for now
-                    worker.push(job)
+                    worker.push(job);
                 }
             }
 
@@ -316,6 +316,6 @@ mod tests {
             10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
             178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
         ];
-        assert_eq!(actual, expected)
+        assert_eq!(actual, expected);
     }
 }
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 115bef7375..5416d63b5b 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -120,7 +120,7 @@ pub async fn task_main(
             tracing::trace!("attempting to cancel a random connection");
             if let Some(token) = config.http_config.cancel_set.take() {
                 tracing::debug!("cancelling a random connection");
-                token.cancel()
+                token.cancel();
             }
         }
 
@@ -198,7 +198,7 @@ async fn connection_startup(
     let peer_addr = peer.unwrap_or(peer_addr).ip();
     let has_private_peer_addr = match peer_addr {
         IpAddr::V4(ip) => ip.is_private(),
-        _ => false,
+        IpAddr::V6(_) => false,
     };
     info!(?session_id, %peer_addr, "accepted new TCP connection");
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index e1dc44dc1c..9ede659cc4 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -390,7 +390,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
             .write()
             .get_conn_entry(conn_info.db_and_user())
         {
-            client = Some(entry.conn)
+            client = Some(entry.conn);
         }
         let endpoint_pool = Arc::downgrade(&endpoint_pool);
 
@@ -662,13 +662,13 @@ impl<C: ClientInnerExt> Discard<'_, C> {
     pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
         let conn_info = &self.conn_info;
         if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is not idle")
+            info!("pool: throwing away connection '{conn_info}' because connection is not idle");
         }
     }
     pub fn discard(&mut self) {
         let conn_info = &self.conn_info;
         if std::mem::take(self.pool).strong_count() > 0 {
-            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
+            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
         }
     }
 }
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 690e92ffb1..7809d2e574 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -234,7 +234,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
                 .await
                 .inspect_err(|_| {
                     if record_handshake_error {
-                        Metrics::get().proxy.tls_handshake_failures.inc()
+                        Metrics::get().proxy.tls_handshake_failures.inc();
                     }
                 })?),
             Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
diff --git a/proxy/src/url.rs b/proxy/src/url.rs
index 92c64bb8ad..202fe8de1f 100644
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -12,7 +12,7 @@ impl ApiUrl {
     }
 
     /// See [`url::Url::path_segments_mut`].
-    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut {
+    pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut<'_> {
         // We've already verified that it works during construction.
         self.0.path_segments_mut().expect("bad API url")
     }
diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs
index 888ad38048..3bd8f4c8ef 100644
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -36,7 +36,7 @@ impl<T> Default for Waiters<T> {
 }
 
 impl<T> Waiters<T> {
-    pub fn register(&self, key: String) -> Result<Waiter<T>, RegisterError> {
+    pub fn register(&self, key: String) -> Result<Waiter<'_, T>, RegisterError> {
         let (tx, rx) = oneshot::channel();
 
         self.0

From eb7241c798d445cd7bcb52d14fbf6c59f4a54d32 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 19 Aug 2024 16:35:34 +0200
Subject: [PATCH 02/55] l0_flush: remove support for mode `page-cached` (#8739)

The switch away from `page-cached` has been rolled out everywhere; no configs reference that mode anymore.

All code that's made dead by the removal of the config option is removed
as part of this PR.

The `page_caching::PreWarmingWriter` in `::No` mode is equivalent to a
`size_tracking_writer`, so, use that.

part of https://github.com/neondatabase/neon/issues/7418
---
 pageserver/src/l0_flush.rs                    |  19 +-
 pageserver/src/tenant/ephemeral_file.rs       |   5 +-
 .../src/tenant/ephemeral_file/page_caching.rs | 169 ++----------------
 .../tenant/storage_layer/inmemory_layer.rs    |  68 +------
 4 files changed, 20 insertions(+), 241 deletions(-)

diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs
index 10187f2ba3..313a7961a6 100644
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,15 +1,10 @@
 use std::{num::NonZeroUsize, sync::Arc};
 
-use crate::tenant::ephemeral_file;
-
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
-    PageCached,
     #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
+    Direct { max_concurrency: NonZeroUsize },
 }
 
 impl Default for L0FlushConfig {
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
 pub struct L0FlushGlobalState(Arc<Inner>);
 
 pub enum Inner {
-    PageCached,
     Direct { semaphore: tokio::sync::Semaphore },
 }
 
 impl L0FlushGlobalState {
     pub fn new(config: L0FlushConfig) -> Self {
         match config {
-            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
             L0FlushConfig::Direct { max_concurrency } => {
                 let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                 Self(Arc::new(Inner::Direct { semaphore }))
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
         &self.0
     }
 }
-
-impl L0FlushConfig {
-    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
-        use L0FlushConfig::*;
-        match self {
-            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
-            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
-        }
-    }
-}
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 770f3ca5f0..3eb8384d05 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
 }
 
 mod page_caching;
-pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;
 
 impl EphemeralFile {
@@ -52,12 +51,10 @@ impl EphemeralFile {
         )
         .await?;
 
-        let prewarm = conf.l0_flush.prewarm_on_write();
-
         Ok(EphemeralFile {
             _tenant_shard_id: tenant_shard_id,
             _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, prewarm, gate_guard),
+            rw: page_caching::RW::new(file, gate_guard),
         })
     }
 
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
index 7355b3b5a3..48926354f1 100644
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,15 +1,15 @@
 //! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
 //! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
+//!
+//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
 
 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
+use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::VirtualFile;
 
-use once_cell::sync::Lazy;
-use std::io::{self, ErrorKind};
-use std::ops::{Deref, Range};
+use std::io::{self};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;
 
@@ -18,33 +18,17 @@ use super::zero_padded_read_write;
 /// See module-level comment.
 pub struct RW {
     page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
+    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
     /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
     _gate_guard: utils::sync::gate::GateGuard,
 }
 
-/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
-/// should we pre-warm the [`crate::page_cache`] with the contents?
-#[derive(Clone, Copy)]
-pub enum PrewarmOnWrite {
-    Yes,
-    No,
-}
-
 impl RW {
-    pub fn new(
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-        _gate_guard: utils::sync::gate::GateGuard,
-    ) -> Self {
+    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
         let page_cache_file_id = page_cache::next_file_id();
         Self {
             page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
-                page_cache_file_id,
-                file,
-                prewarm_on_write,
-            )),
+            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
             _gate_guard,
         }
     }
@@ -84,10 +68,10 @@ impl RW {
         let vec = Vec::with_capacity(size);
 
         // read from disk what we've already flushed
-        let writer = self.rw.as_writer();
-        let flushed_range = writer.written_range();
-        let mut vec = writer
-            .file
+        let file_size_tracking_writer = self.rw.as_writer();
+        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
+        let mut vec = file_size_tracking_writer
+            .as_inner()
             .read_exact_at(
                 vec.slice(0..(flushed_range.end - flushed_range.start)),
                 u64::try_from(flushed_range.start).unwrap(),
@@ -122,7 +106,7 @@ impl RW {
                             format!(
                                 "ephemeral file: read immutable page #{}: {}: {:#}",
                                 blknum,
-                                self.rw.as_writer().file.path,
+                                self.rw.as_writer().as_inner().path,
                                 e,
                             ),
                         )
@@ -132,7 +116,7 @@ impl RW {
                     }
                     page_cache::ReadBufResult::NotFound(write_guard) => {
                         let write_guard = writer
-                            .file
+                            .as_inner()
                             .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                             .await?;
                         let read_guard = write_guard.mark_valid();
@@ -154,137 +138,16 @@ impl Drop for RW {
 
         // unlink the file
         // we are clear to do this, because we have entered a gate
-        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
+        let path = &self.rw.as_writer().as_inner().path;
+        let res = std::fs::remove_file(path);
         if let Err(e) = res {
             if e.kind() != std::io::ErrorKind::NotFound {
                 // just never log the not found errors, we cannot do anything for them; on detach
                 // the tenant directory is already gone.
                 //
                 // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.rw.as_writer().file.path,
-                    e
-                );
+                error!("could not remove ephemeral file '{path}': {e}");
             }
         }
     }
 }
-
-struct PreWarmingWriter {
-    prewarm_on_write: PrewarmOnWrite,
-    nwritten_blocks: u32,
-    page_cache_file_id: page_cache::FileId,
-    file: VirtualFile,
-}
-
-impl PreWarmingWriter {
-    fn new(
-        page_cache_file_id: page_cache::FileId,
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-    ) -> Self {
-        Self {
-            prewarm_on_write,
-            nwritten_blocks: 0,
-            page_cache_file_id,
-            file,
-        }
-    }
-
-    /// Return the byte range within `file` that has been written though `write_all`.
-    ///
-    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
-    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
-        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
-        struct Wrapper(Range<usize>);
-        impl Deref for Wrapper {
-            type Target = Range<usize>;
-            fn deref(&self) -> &Range<usize> {
-                &self.0
-            }
-        }
-        Wrapper(0..nwritten_blocks * PAGE_SZ)
-    }
-}
-
-impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let buflen = buf.len();
-        assert_eq!(
-            buflen % PAGE_SZ,
-            0,
-            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
-        );
-
-        // Do the IO.
-        let buf = match self.file.write_all(buf, ctx).await {
-            (buf, Ok(nwritten)) => {
-                assert_eq!(nwritten, buflen);
-                buf
-            }
-            (_, Err(e)) => {
-                return Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    // order error before path because path is long and error is short
-                    format!(
-                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
-                        self.nwritten_blocks, buflen, e, self.file.path,
-                    ),
-                ));
-            }
-        };
-
-        let nblocks = buflen / PAGE_SZ;
-        let nblocks32 = u32::try_from(nblocks).unwrap();
-
-        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
-            // Pre-warm page cache with the contents.
-            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-            // benefits the code that writes InMemoryLayer=>L0 layers.
-
-            let cache = page_cache::get();
-            static CTX: Lazy<RequestContext> = Lazy::new(|| {
-                RequestContext::new(
-                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                    crate::context::DownloadBehavior::Error,
-                )
-            });
-            for blknum_in_buffer in 0..nblocks {
-                let blk_in_buffer =
-                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-                let blknum = self
-                    .nwritten_blocks
-                    .checked_add(blknum_in_buffer as u32)
-                    .unwrap();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                    .await
-                {
-                    Err(e) => {
-                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                    }
-                    Ok(v) => match v {
-                        page_cache::ReadBufResult::Found(_guard) => {
-                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                        }
-                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                            write_guard.copy_from_slice(blk_in_buffer);
-                            let _ = write_guard.mark_valid();
-                        }
-                    },
-                }
-            }
-        }
-
-        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf))
-    }
-}
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 748d79c149..130d1002a0 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::{l0_flush, page_cache, walrecord};
+use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -249,9 +249,7 @@ impl InMemoryLayer {
     /// debugging function to print out the contents of the layer
     ///
     /// this is likely completly unused
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().await;
-
+    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
         let end_str = self.end_lsn_or_max();
 
         println!(
@@ -259,39 +257,6 @@ impl InMemoryLayer {
             self.timeline_id, self.start_lsn, end_str,
         );
 
-        if !verbose {
-            return Ok(());
-        }
-
-        let cursor = inner.file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, vec_map) in inner.index.iter() {
-            for (lsn, pos) in vec_map.as_slice() {
-                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                let val = Value::des(&buf);
-                match val {
-                    Ok(Value::Image(img)) => {
-                        write!(&mut desc, " img {} bytes", img.len())?;
-                    }
-                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
-                        write!(
-                            &mut desc,
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )?;
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
-                    }
-                }
-                println!("  key {} at {}: {}", key, lsn, desc);
-            }
-        }
-
         Ok(())
     }
 
@@ -536,7 +501,6 @@ impl InMemoryLayer {
 
         use l0_flush::Inner;
         let _concurrency_permit = match l0_flush_global_state {
-            Inner::PageCached => None,
             Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
         };
 
@@ -568,34 +532,6 @@ impl InMemoryLayer {
         .await?;
 
         match l0_flush_global_state {
-            l0_flush::Inner::PageCached => {
-                let ctx = RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::InMemoryLayer)
-                    .build();
-
-                let mut buf = Vec::new();
-
-                let cursor = inner.file.block_cursor();
-
-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let (tmp, res) = delta_layer_writer
-                            .put_value_bytes(
-                                Key::from_compact(*key),
-                                *lsn,
-                                buf.slice_len(),
-                                will_init,
-                                &ctx,
-                            )
-                            .await;
-                        res?;
-                        buf = tmp.into_raw_slice().into_inner();
-                    }
-                }
-            }
             l0_flush::Inner::Direct { .. } => {
                 let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
                 assert_eq!(

From 3b8ca477ab6852143f8acb5b8217e5f24e9e8605 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Aug 2024 16:39:44 +0200
Subject: [PATCH 03/55] Migrate physical GC and scan_metadata to remote_storage
 (#8673)

Migrates most of the remaining parts of the scrubber to remote_storage:

* `pageserver_physical_gc`
* `scan_metadata` for pageservers (safekeepers were done in #8595)
* `download()` in `tenant_snapshot`. The main `tenant_snapshot` is not
migrated as it uses version history to be able to work in the face of
ongoing changes.

Part of #7547
---
 libs/remote_storage/src/azure_blob.rs         |  42 +++++
 libs/remote_storage/src/lib.rs                |  24 ++-
 libs/remote_storage/src/local_fs.rs           |  14 ++
 libs/remote_storage/src/metrics.rs            |   9 +-
 libs/remote_storage/src/s3_bucket.rs          |  74 +++++++-
 libs/remote_storage/src/simulate_failures.rs  |  11 ++
 storage_scrubber/src/checks.rs                |  85 ++++-----
 storage_scrubber/src/lib.rs                   |  69 ++++---
 storage_scrubber/src/metadata_stream.rs       | 170 ++++--------------
 .../src/pageserver_physical_gc.rs             | 134 +++++---------
 .../src/scan_pageserver_metadata.rs           |  40 ++---
 .../src/scan_safekeeper_metadata.rs           |   7 +-
 storage_scrubber/src/tenant_snapshot.rs       |  27 +--
 test_runner/fixtures/neon_fixtures.py         |   1 +
 14 files changed, 366 insertions(+), 341 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 3c77d5a227..cb7479f6cd 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
         }
     }
 
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let kind = RequestKind::Head;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let started_at = start_measuring_requests(kind);
+
+        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
+        let properties_future = blob_client.get_properties().into_future();
+
+        let properties_future = tokio::time::timeout(self.timeout, properties_future);
+
+        let res = tokio::select! {
+            res = properties_future => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
+        }
+
+        let data = match res {
+            Ok(Ok(data)) => Ok(data),
+            Ok(Err(sdk)) => Err(to_download_error(sdk)),
+            Err(_timeout) => Err(DownloadError::Timeout),
+        }?;
+
+        let properties = data.blob.properties;
+        Ok(ListingObject {
+            key: key.to_owned(),
+            last_modified: SystemTime::from(properties.last_modified),
+            size: properties.content_length,
+        })
+    }
+
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 2c9e298f79..cc1d3e0ae4 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -150,7 +150,7 @@ pub enum ListingMode {
     NoDelimiter,
 }
 
-#[derive(PartialEq, Eq, Debug)]
+#[derive(PartialEq, Eq, Debug, Clone)]
 pub struct ListingObject {
     pub key: RemotePath,
     pub last_modified: SystemTime,
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
         Ok(combined)
     }
 
+    /// Obtain metadata information about an object.
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError>;
+
     /// Streams the local file contents into remote into the remote storage entry.
     ///
     /// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         }
     }
 
+    /// See [`RemoteStorage::head_object`].
+    pub async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.head_object(key, cancel).await,
+            Self::AwsS3(s) => s.head_object(key, cancel).await,
+            Self::AzureBlob(s) => s.head_object(key, cancel).await,
+            Self::Unreliable(s) => s.head_object(key, cancel).await,
+        }
+    }
+
     /// See [`RemoteStorage::upload`]
     pub async fn upload(
         &self,
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
             RequestKind::Delete => &self.write,
             RequestKind::Copy => &self.write,
             RequestKind::TimeTravel => &self.write,
+            RequestKind::Head => &self.read,
         }
     }
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 99b4aa4061..c3ef18cab1 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
         }
     }
 
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        _cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let target_file_path = key.with_base(&self.storage_root);
+        let metadata = file_metadata(&target_file_path).await?;
+        Ok(ListingObject {
+            key: key.clone(),
+            last_modified: metadata.modified()?,
+            size: metadata.len(),
+        })
+    }
+
     async fn upload(
         &self,
         data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs
index bbb51590f3..f1aa4c433b 100644
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
     List = 3,
     Copy = 4,
     TimeTravel = 5,
+    Head = 6,
 }
 
 use scopeguard::ScopeGuard;
@@ -27,6 +28,7 @@ impl RequestKind {
             List => "list_objects",
             Copy => "copy_object",
             TimeTravel => "time_travel_recover",
+            Head => "head_object",
         }
     }
     const fn as_index(&self) -> usize {
@@ -34,7 +36,8 @@ impl RequestKind {
     }
 }
 
-pub(crate) struct RequestTyped<C>([C; 6]);
+const REQUEST_KIND_COUNT: usize = 7;
+pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
 
 impl<C> RequestTyped<C> {
     pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {
 
     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
-        let arr = std::array::from_fn::<C, 6, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
+        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 1f25da813d..11f6598cbf 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -23,7 +23,7 @@ use aws_config::{
 use aws_sdk_s3::{
     config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
     error::SdkError,
-    operation::get_object::GetObjectError,
+    operation::{get_object::GetObjectError, head_object::HeadObjectError},
     types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
     Client,
 };
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
         }
     }
 
+    /// Fetches metadata (last-modified time and size) for `key` via a HEAD request.
+    ///
+    /// A missing object yields [`DownloadError::NotFound`], an elapsed `self.timeout`
+    /// yields [`DownloadError::Timeout`], and a fired `cancel` aborts the request.
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<ListingObject, DownloadError> {
+        let kind = RequestKind::Head;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let started_at = start_measuring_requests(kind);
+
+        let head_future = self
+            .client
+            .head_object()
+            .bucket(self.bucket_name())
+            .key(self.relative_path_to_s3_object(key))
+            .send();
+
+        let head_future = tokio::time::timeout(self.timeout, head_future);
+
+        let res = tokio::select! {
+            res = head_future => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        let res = res.map_err(|_e| DownloadError::Timeout)?;
+
+        // do not incl. timeouts as errors in metrics but cancellations
+        let started_at = ScopeGuard::into_inner(started_at);
+
+        // Classify the response first so that exactly one sample is recorded per
+        // request; observing `&res` up front and again in the error arms would
+        // count every failed request twice.
+        let (outcome, res) = match res {
+            Ok(object_output) => (AttemptOutcome::Ok, Ok(object_output)),
+            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
+                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
+                // an error: we expect to sometimes fetch an object and find it missing,
+                // e.g. when probing for timeline indices.
+                (AttemptOutcome::Ok, Err(DownloadError::NotFound))
+            }
+            Err(e) => (
+                AttemptOutcome::Err,
+                Err(DownloadError::Other(anyhow::Error::new(e).context("s3 head object"))),
+            ),
+        };
+
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
+        let data = res?;
+
+        // Both fields are optional in the response type; we need each of them.
+        let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
+            return Err(DownloadError::Other(anyhow!(
+                "head_object doesn't contain last_modified or content_length"
+            )));
+        };
+        Ok(ListingObject {
+            key: key.to_owned(),
+            last_modified: SystemTime::try_from(last_modified).map_err(|e| {
+                DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
+            })?,
+            size: u64::try_from(size).map_err(|e| {
+                DownloadError::Other(anyhow!("invalid content length '{size}': {e}"))
+            })?,
+        })
+    }
+
     async fn upload(
         &self,
         from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 13f873dcdb..c7eb634af3 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
     ListPrefixes(Option<RemotePath>),
+    HeadObject(RemotePath),
     Upload(RemotePath),
     Download(RemotePath),
     Delete(RemotePath),
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
         self.inner.list(prefix, mode, max_keys, cancel).await
     }
 
+    async fn head_object(
+        &self,
+        key: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<crate::ListingObject, DownloadError> {
+        self.attempt(RemoteOp::HeadObject(key.clone()))
+            .map_err(DownloadError::Other)?;
+        self.inner.head_object(key, cancel).await
+    }
+
     async fn upload(
         &self,
         data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 35ec69fd50..9063b3c197 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,22 +1,22 @@
 use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
-use aws_sdk_s3::Client;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
 
 use crate::cloud_admin_api::BranchData;
-use crate::metadata_stream::stream_listing;
-use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
+use crate::metadata_stream::stream_listing_generic;
+use crate::{download_object_with_retries_generic, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
 
 pub(crate) struct TimelineAnalysis {
     /// Anomalies detected
@@ -48,13 +48,12 @@ impl TimelineAnalysis {
 }
 
 pub(crate) async fn branch_cleanup_and_check_errors(
-    s3_client: &Client,
-    target: &RootTarget,
+    remote_client: &GenericRemoteStorage,
     id: &TenantShardTimelineId,
     tenant_objects: &mut TenantObjectListing,
     s3_active_branch: Option<&BranchData>,
     console_branch: Option<BranchData>,
-    s3_data: Option<S3TimelineBlobData>,
+    s3_data: Option<RemoteTimelineBlobData>,
 ) -> TimelineAnalysis {
     let mut result = TimelineAnalysis::new();
 
@@ -78,7 +77,9 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 
     match s3_data {
         Some(s3_data) => {
-            result.garbage_keys.extend(s3_data.unknown_keys);
+            result
+                .garbage_keys
+                .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
 
             match s3_data.blob_data {
                 BlobDataParseResult::Parsed {
@@ -143,11 +144,8 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 
                             // HEAD request used here to address a race condition  when an index was uploaded concurrently
                             // with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
-                            let response = s3_client
-                                .head_object()
-                                .bucket(target.bucket_name())
-                                .key(path.get_path().as_str())
-                                .send()
+                            let response = remote_client
+                                .head_object(&path, &CancellationToken::new())
                                 .await;
 
                             if response.is_err() {
@@ -284,14 +282,14 @@ impl TenantObjectListing {
 }
 
 #[derive(Debug)]
-pub(crate) struct S3TimelineBlobData {
+pub(crate) struct RemoteTimelineBlobData {
     pub(crate) blob_data: BlobDataParseResult,
 
     // Index objects that were not used when loading `blob_data`, e.g. those from old generations
-    pub(crate) unused_index_keys: Vec<String>,
+    pub(crate) unused_index_keys: Vec<ListingObject>,
 
     // Objects whose keys were not recognized at all, i.e. not layer files, not indices
-    pub(crate) unknown_keys: Vec<String>,
+    pub(crate) unknown_keys: Vec<ListingObject>,
 }
 
 #[derive(Debug)]
@@ -322,11 +320,11 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
     }
 }
 
-pub(crate) async fn list_timeline_blobs(
-    s3_client: &Client,
+pub(crate) async fn list_timeline_blobs_generic(
+    remote_client: &GenericRemoteStorage,
     id: TenantShardTimelineId,
     s3_root: &RootTarget,
-) -> anyhow::Result<S3TimelineBlobData> {
+) -> anyhow::Result<RemoteTimelineBlobData> {
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
@@ -335,19 +333,25 @@ pub(crate) async fn list_timeline_blobs(
     let mut timeline_dir_target = s3_root.timeline_root(&id);
     timeline_dir_target.delimiter = String::new();
 
-    let mut index_part_keys: Vec<String> = Vec::new();
+    let mut index_part_keys: Vec<ListingObject> = Vec::new();
     let mut initdb_archive: bool = false;
 
-    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
-    while let Some(obj) = stream.next().await {
-        let obj = obj?;
-        let key = obj.key();
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
 
-        let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
+    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    while let Some(obj) = stream.next().await {
+        let (key, Some(obj)) = obj? else {
+            panic!("ListingObject not specified");
+        };
+
+        let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
         match blob_name {
             Some(name) if name.starts_with("index_part.json") => {
                 tracing::debug!("Index key {key}");
-                index_part_keys.push(key.to_owned())
+                index_part_keys.push(obj)
             }
             Some("initdb.tar.zst") => {
                 tracing::debug!("initdb archive {key}");
@@ -358,7 +362,7 @@ pub(crate) async fn list_timeline_blobs(
             }
             Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                 Ok((new_layer, gen)) => {
-                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
+                    tracing::debug!("Parsed layer key: {new_layer} {gen:?}");
                     s3_layers.insert((new_layer, gen));
                 }
                 Err(e) => {
@@ -366,13 +370,13 @@ pub(crate) async fn list_timeline_blobs(
                     errors.push(
                         format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                     );
-                    unknown_keys.push(key.to_string());
+                    unknown_keys.push(obj);
                 }
             },
             None => {
-                tracing::warn!("Unknown key {}", key);
+                tracing::warn!("Unknown key {key}");
                 errors.push(format!("S3 list response got an object with odd key {key}"));
-                unknown_keys.push(key.to_string());
+                unknown_keys.push(obj);
             }
         }
     }
@@ -381,7 +385,7 @@ pub(crate) async fn list_timeline_blobs(
         tracing::debug!(
             "Timeline is empty apart from initdb archive: expected post-deletion state."
         );
-        return Ok(S3TimelineBlobData {
+        return Ok(RemoteTimelineBlobData {
             blob_data: BlobDataParseResult::Relic,
             unused_index_keys: index_part_keys,
             unknown_keys: Vec::new(),
@@ -395,13 +399,13 @@ pub(crate) async fn list_timeline_blobs(
             // Stripping the index key to the last part, because RemotePath doesn't
             // like absolute paths, and depending on prefix_in_bucket it's possible
             // for the keys we read back to start with a slash.
-            let basename = key.rsplit_once('/').unwrap().1;
+            let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
             parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
         })
         .max_by_key(|i| i.1)
         .map(|(k, g)| (k.clone(), g))
     {
-        Some((key, gen)) => (Some(key), gen),
+        Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen),
         None => {
             // Legacy/missing case: one or zero index parts, which did not have a generation
             (index_part_keys.pop(), Generation::none())
@@ -416,17 +420,14 @@ pub(crate) async fn list_timeline_blobs(
     }
 
     if let Some(index_part_object_key) = index_part_object.as_ref() {
-        let index_part_bytes = download_object_with_retries(
-            s3_client,
-            &timeline_dir_target.bucket_name,
-            index_part_object_key,
-        )
-        .await
-        .context("index_part.json download")?;
+        let index_part_bytes =
+            download_object_with_retries_generic(remote_client, &index_part_object_key.key)
+                .await
+                .context("index_part.json download")?;
 
         match serde_json::from_slice(&index_part_bytes) {
             Ok(index_part) => {
-                return Ok(S3TimelineBlobData {
+                return Ok(RemoteTimelineBlobData {
                     blob_data: BlobDataParseResult::Parsed {
                         index_part: Box::new(index_part),
                         index_part_generation,
@@ -448,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
         );
     }
 
-    Ok(S3TimelineBlobData {
+    Ok(RemoteTimelineBlobData {
         blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
         unused_index_keys: index_part_keys,
         unknown_keys,
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 1fc94cc174..3183bc3c64 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -452,23 +452,26 @@ fn stream_objects_with_retries<'a>(
         let mut list_stream =
             storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
         while let Some(res) = list_stream.next().await {
-            if let Err(err) = res {
-                let yield_err = if err.is_permanent() {
-                    true
-                } else {
-                    let backoff_time = 1 << trial.max(5);
-                    tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-                    trial += 1;
-                    trial == MAX_RETRIES - 1
-                };
-                if yield_err {
-                    yield Err(err)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                    break;
+            match res {
+                Err(err) => {
+                    let yield_err = if err.is_permanent() {
+                        true
+                    } else {
+                        let backoff_time = 1 << trial.min(5); // backoff capped at 32s
+                        tokio::time::sleep(Duration::from_secs(backoff_time)).await;
+                        trial += 1;
+                        trial == MAX_RETRIES - 1
+                    };
+                    if yield_err {
+                        yield Err(err)
+                            .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                        break;
+                    }
+                }
+                Ok(res) => {
+                    trial = 0;
+                    yield Ok(res);
                 }
-            } else {
-                trial = 0;
-                yield res.map_err(anyhow::Error::from);
             }
         }
     }
@@ -513,41 +516,35 @@ async fn list_objects_with_retries_generic(
     panic!("MAX_RETRIES is not allowed to be 0");
 }
 
-async fn download_object_with_retries(
-    s3_client: &Client,
-    bucket_name: &str,
-    key: &str,
+async fn download_object_with_retries_generic(
+    remote_client: &GenericRemoteStorage,
+    key: &RemotePath,
 ) -> anyhow::Result<Vec<u8>> {
-    for _ in 0..MAX_RETRIES {
-        let mut body_buf = Vec::new();
-        let response_stream = match s3_client
-            .get_object()
-            .bucket(bucket_name)
-            .key(key)
-            .send()
-            .await
-        {
+    let cancel = CancellationToken::new();
+    for trial in 0..MAX_RETRIES {
+        let mut buf = Vec::new();
+        let download = match remote_client.download(key, &cancel).await {
             Ok(response) => response,
             Err(e) => {
                 error!("Failed to download object for key {key}: {e}");
-                tokio::time::sleep(Duration::from_secs(1)).await;
+                let backoff_time = 1 << trial.min(5); // backoff capped at 32s
+                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
                 continue;
             }
         };
 
-        match response_stream
-            .body
-            .into_async_read()
-            .read_to_end(&mut body_buf)
+        match tokio_util::io::StreamReader::new(download.download_stream)
+            .read_to_end(&mut buf)
             .await
         {
             Ok(bytes_read) => {
                 tracing::debug!("Downloaded {bytes_read} bytes for object {key}");
-                return Ok(body_buf);
+                return Ok(buf);
             }
             Err(e) => {
                 error!("Failed to stream object body for key {key}: {e}");
-                tokio::time::sleep(Duration::from_secs(1)).await;
+                let backoff_time = 1 << trial.min(5); // backoff capped at 32s
+                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
             }
         }
     }
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index 54812ffc94..eca774413a 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -2,14 +2,14 @@ use std::str::FromStr;
 
 use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
-use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use aws_sdk_s3::Client;
 use futures::StreamExt;
 use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;
 
 use crate::{
-    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
-    TenantShardTimelineId,
+    list_objects_with_retries, list_objects_with_retries_generic, stream_objects_with_retries,
+    RootTarget, S3Target, TenantShardTimelineId,
 };
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};
@@ -75,53 +75,38 @@ pub fn stream_tenants<'a>(
 }
 
 pub async fn stream_tenant_shards<'a>(
-    s3_client: &'a Client,
+    remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
     tenant_id: TenantId,
 ) -> anyhow::Result<impl Stream<Item = Result<TenantShardId, anyhow::Error>> + 'a> {
-    let mut tenant_shard_ids: Vec<Result<TenantShardId, anyhow::Error>> = Vec::new();
-    let mut continuation_token = None;
     let shards_target = target.tenant_shards_prefix(&tenant_id);
 
-    loop {
-        tracing::info!("Listing in {}", shards_target.prefix_in_bucket);
-        let fetch_response =
-            list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await;
-        let fetch_response = match fetch_response {
-            Err(e) => {
-                tenant_shard_ids.push(Err(e));
-                break;
-            }
-            Ok(r) => r,
-        };
+    let strip_prefix = target.tenants_root().prefix_in_bucket;
+    let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix);
 
-        let new_entry_ids = fetch_response
-            .common_prefixes()
-            .iter()
-            .filter_map(|prefix| prefix.prefix())
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .strip_prefix(&target.tenants_root().prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                let first_part = entry_id_str.split('/').next().unwrap();
+    tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket);
+    let listing = list_objects_with_retries_generic(
+        remote_client,
+        ListingMode::WithDelimiter,
+        &shards_target,
+    )
+    .await?;
 
-                first_part
-                    .parse::<TenantShardId>()
-                    .with_context(|| format!("Incorrect entry id str: {first_part}"))
-            });
+    let tenant_shard_ids = listing
+        .prefixes
+        .iter()
+        .map(|prefix| prefix.get_path().as_str())
+        .filter_map(|prefix| -> Option<&str> { prefix.strip_prefix(prefix_str) })
+        .map(|entry_id_str| {
+            let first_part = entry_id_str.split('/').next().unwrap();
 
-        for i in new_entry_ids {
-            tenant_shard_ids.push(i);
-        }
-
-        match fetch_response.next_continuation_token {
-            Some(new_token) => continuation_token = Some(new_token),
-            None => break,
-        }
-    }
+            first_part
+                .parse::<TenantShardId>()
+                .with_context(|| format!("Incorrect entry id str: {first_part}"))
+        })
+        .collect::<Vec<_>>();
 
+    tracing::debug!("Yielding {} shards for {tenant_id}", tenant_shard_ids.len());
     Ok(stream! {
         for i in tenant_shard_ids {
             let id = i?;
@@ -130,65 +115,6 @@ pub async fn stream_tenant_shards<'a>(
     })
 }
 
-/// Given a TenantShardId, output a stream of the timelines within that tenant, discovered
-/// using ListObjectsv2.  The listing is done before the stream is built, so that this
-/// function can be used to generate concurrency on a stream using buffer_unordered.
-pub async fn stream_tenant_timelines<'a>(
-    s3_client: &'a Client,
-    target: &'a RootTarget,
-    tenant: TenantShardId,
-) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
-    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
-    let mut continuation_token = None;
-    let timelines_target = target.timelines_root(&tenant);
-
-    loop {
-        tracing::debug!("Listing in {}", tenant);
-        let fetch_response =
-            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
-                .await;
-        let fetch_response = match fetch_response {
-            Err(e) => {
-                timeline_ids.push(Err(e));
-                break;
-            }
-            Ok(r) => r,
-        };
-
-        let new_entry_ids = fetch_response
-            .common_prefixes()
-            .iter()
-            .filter_map(|prefix| prefix.prefix())
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .strip_prefix(&timelines_target.prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                entry_id_str
-                    .parse::<TimelineId>()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            });
-
-        for i in new_entry_ids {
-            timeline_ids.push(i);
-        }
-
-        match fetch_response.next_continuation_token {
-            Some(new_token) => continuation_token = Some(new_token),
-            None => break,
-        }
-    }
-
-    tracing::debug!("Yielding for {}", tenant);
-    Ok(stream! {
-        for i in timeline_ids {
-            let id = i?;
-            yield Ok(TenantShardTimelineId::new(tenant, id));
-        }
-    })
-}
-
 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
 /// using a listing. The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
@@ -200,6 +126,11 @@ pub async fn stream_tenant_timelines_generic<'a>(
     let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
     let timelines_target = target.timelines_root(&tenant);
 
+    let prefix_str = &timelines_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timelines_target.prefix_in_bucket);
+
     let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
         remote_client,
         ListingMode::WithDelimiter,
@@ -220,11 +151,7 @@ pub async fn stream_tenant_timelines_generic<'a>(
             .prefixes
             .iter()
             .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .get_path()
-                    .as_str()
-                    .strip_prefix(&timelines_target.prefix_in_bucket)?
-                    .strip_suffix('/')
+                prefix.get_path().as_str().strip_prefix(prefix_str)
             })
             .map(|entry_id_str| {
                 entry_id_str
@@ -237,7 +164,7 @@ pub async fn stream_tenant_timelines_generic<'a>(
         }
     }
 
-    tracing::debug!("Yielding for {}", tenant);
+    tracing::debug!("Yielding {} timelines for {}", timeline_ids.len(), tenant);
     Ok(stream! {
         for i in timeline_ids {
             let id = i?;
@@ -246,37 +173,6 @@ pub async fn stream_tenant_timelines_generic<'a>(
     })
 }
 
-pub(crate) fn stream_listing<'a>(
-    s3_client: &'a Client,
-    target: &'a S3Target,
-) -> impl Stream<Item = anyhow::Result<ObjectIdentifier>> + 'a {
-    try_stream! {
-        let mut continuation_token = None;
-        loop {
-            let fetch_response =
-                list_objects_with_retries(s3_client, target, continuation_token.clone()).await?;
-
-            if target.delimiter.is_empty() {
-                for object_key in fetch_response.contents().iter().filter_map(|object| object.key())
-                {
-                    let object_id = ObjectIdentifier::builder().key(object_key).build()?;
-                    yield object_id;
-                }
-            } else {
-                for prefix in fetch_response.common_prefixes().iter().filter_map(|p| p.prefix()) {
-                    let object_id = ObjectIdentifier::builder().key(prefix).build()?;
-                    yield object_id;
-                }
-            }
-
-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-    }
-}
-
 pub(crate) fn stream_listing_generic<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a S3Target,
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 20d9bd6dd4..6828081128 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -1,11 +1,10 @@
 use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
-use std::time::{Duration, SystemTime};
+use std::time::Duration;
 
-use crate::checks::{list_timeline_blobs, BlobDataParseResult};
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
-use aws_sdk_s3::Client;
+use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult};
+use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
+use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
@@ -13,10 +12,11 @@ use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::controller_api::TenantDescribeResponse;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
 use reqwest::Method;
 use serde::Serialize;
 use storage_controller_client::control_api;
+use tokio_util::sync::CancellationToken;
 use tracing::{info_span, Instrument};
 use utils::generation::Generation;
 use utils::id::{TenantId, TenantTimelineId};
@@ -240,38 +240,13 @@ impl TenantRefAccumulator {
     }
 }
 
-async fn is_old_enough(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
-    min_age: &Duration,
-    key: &str,
-    summary: &mut GcSummary,
-) -> bool {
+fn is_old_enough(min_age: &Duration, key: &ListingObject, summary: &mut GcSummary) -> bool {
     // Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident
     // it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects.
-    let age: Duration = match s3_client
-        .head_object()
-        .bucket(&bucket_config.bucket)
-        .key(key)
-        .send()
-        .await
-    {
-        Ok(response) => match response.last_modified {
-            None => {
-                tracing::warn!("Missing last_modified");
-                summary.remote_storage_errors += 1;
-                return false;
-            }
-            Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) {
-                Ok(Ok(e)) => e,
-                Err(_) | Ok(Err(_)) => {
-                    tracing::warn!("Bad last_modified time: {last_modified:?}");
-                    return false;
-                }
-            },
-        },
-        Err(e) => {
-            tracing::warn!("Failed to HEAD {key}: {e}");
+    let age = match key.last_modified.elapsed() {
+        Ok(e) => e,
+        Err(_) => {
+            tracing::warn!("Bad last_modified time: {:?}", key.last_modified);
             summary.remote_storage_errors += 1;
             return false;
         }
@@ -289,17 +264,30 @@ async fn is_old_enough(
     old_enough
 }
 
+/// Same as [`is_old_enough`], but doesn't require a [`ListingObject`] passed to it.
+async fn check_is_old_enough(
+    remote_client: &GenericRemoteStorage,
+    key: &RemotePath,
+    min_age: &Duration,
+    summary: &mut GcSummary,
+) -> Option<bool> {
+    let listing_object = remote_client
+        .head_object(key, &CancellationToken::new())
+        .await
+        .ok()?;
+    Some(is_old_enough(min_age, &listing_object, summary))
+}
+
 async fn maybe_delete_index(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
+    remote_client: &GenericRemoteStorage,
     min_age: &Duration,
     latest_gen: Generation,
-    key: &str,
+    obj: &ListingObject,
     mode: GcMode,
     summary: &mut GcSummary,
 ) {
     // Validation: we will only delete things that parse cleanly
-    let basename = key.rsplit_once('/').unwrap().1;
+    let basename = obj.key.get_path().file_name().unwrap();
     let candidate_generation =
         match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) {
             Some(g) => g,
@@ -328,7 +316,7 @@ async fn maybe_delete_index(
         return;
     }
 
-    if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await {
+    if !is_old_enough(min_age, obj, summary) {
         return;
     }
 
@@ -338,11 +326,8 @@ async fn maybe_delete_index(
     }
 
     // All validations passed: erase the object
-    match s3_client
-        .delete_object()
-        .bucket(&bucket_config.bucket)
-        .key(key)
-        .send()
+    match remote_client
+        .delete(&obj.key, &CancellationToken::new())
         .await
     {
         Ok(_) => {
@@ -358,8 +343,7 @@ async fn maybe_delete_index(
 
 #[allow(clippy::too_many_arguments)]
 async fn gc_ancestor(
-    s3_client: &Client,
-    bucket_config: &BucketConfig,
+    remote_client: &GenericRemoteStorage,
     root_target: &RootTarget,
     min_age: &Duration,
     ancestor: TenantShardId,
@@ -368,7 +352,7 @@ async fn gc_ancestor(
     summary: &mut GcSummary,
 ) -> anyhow::Result<()> {
     // Scan timelines in the ancestor
-    let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?;
+    let timelines = stream_tenant_timelines_generic(remote_client, root_target, ancestor).await?;
     let mut timelines = std::pin::pin!(timelines);
 
     // Build a list of keys to retain
@@ -376,7 +360,7 @@ async fn gc_ancestor(
     while let Some(ttid) = timelines.next().await {
         let ttid = ttid?;
 
-        let data = list_timeline_blobs(s3_client, ttid, root_target).await?;
+        let data = list_timeline_blobs_generic(remote_client, ttid, root_target).await?;
 
         let s3_layers = match data.blob_data {
             BlobDataParseResult::Parsed {
@@ -427,7 +411,8 @@ async fn gc_ancestor(
 
             // We apply a time threshold to GCing objects that are un-referenced: this preserves our ability
             // to roll back a shard split if we have to, by avoiding deleting ancestor layers right away
-            if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await {
+            let path = RemotePath::from_string(key.strip_prefix("/").unwrap_or(&key)).unwrap();
+            if check_is_old_enough(remote_client, &path, min_age, summary).await != Some(true) {
                 continue;
             }
 
@@ -437,13 +422,7 @@ async fn gc_ancestor(
             }
 
             // All validations passed: erase the object
-            match s3_client
-                .delete_object()
-                .bucket(&bucket_config.bucket)
-                .key(&key)
-                .send()
-                .await
-            {
+            match remote_client.delete(&path, &CancellationToken::new()).await {
                 Ok(_) => {
                     tracing::info!("Successfully deleted unreferenced ancestor layer {key}");
                     summary.ancestor_layers_deleted += 1;
@@ -477,10 +456,11 @@ pub async fn pageserver_physical_gc(
     min_age: Duration,
     mode: GcMode,
 ) -> anyhow::Result<GcSummary> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let (remote_client, target) =
+        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
 
     let tenants = if tenant_shard_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
     };
@@ -493,14 +473,13 @@ pub async fn pageserver_physical_gc(
     let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
     // Generate a stream of S3TimelineBlobData
     async fn gc_timeline(
-        s3_client: &Client,
-        bucket_config: &BucketConfig,
+        remote_client: &GenericRemoteStorage,
         min_age: &Duration,
         target: &RootTarget,
         mode: GcMode,
@@ -508,7 +487,7 @@ pub async fn pageserver_physical_gc(
         accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
     ) -> anyhow::Result<GcSummary> {
         let mut summary = GcSummary::default();
-        let data = list_timeline_blobs(s3_client, ttid, target).await?;
+        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
 
         let (index_part, latest_gen, candidates) = match &data.blob_data {
             BlobDataParseResult::Parsed {
@@ -533,17 +512,9 @@ pub async fn pageserver_physical_gc(
         accumulator.lock().unwrap().update(ttid, index_part);
 
         for key in candidates {
-            maybe_delete_index(
-                s3_client,
-                bucket_config,
-                min_age,
-                latest_gen,
-                &key,
-                mode,
-                &mut summary,
-            )
-            .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key))
-            .await;
+            maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary)
+                .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key))
+                .await;
         }
 
         Ok(summary)
@@ -554,15 +525,7 @@ pub async fn pageserver_physical_gc(
     // Drain futures for per-shard GC, populating accumulator as a side effect
     {
         let timelines = timelines.map_ok(|ttid| {
-            gc_timeline(
-                &s3_client,
-                bucket_config,
-                &min_age,
-                &target,
-                mode,
-                ttid,
-                &accumulator,
-            )
+            gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator)
         });
         let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
@@ -586,8 +549,7 @@ pub async fn pageserver_physical_gc(
 
     for ancestor_shard in ancestor_shards {
         gc_ancestor(
-            &s3_client,
-            bucket_config,
+            &remote_client,
             &target,
             &min_age,
             ancestor_shard,
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index 2409b7b132..e89e97ccb6 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -1,16 +1,16 @@
 use std::collections::{HashMap, HashSet};
 
 use crate::checks::{
-    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
-    TenantObjectListing, TimelineAnalysis,
+    branch_cleanup_and_check_errors, list_timeline_blobs_generic, BlobDataParseResult,
+    RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
 };
-use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
-use aws_sdk_s3::Client;
+use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
+use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;
@@ -36,7 +36,7 @@ impl MetadataSummary {
         Self::default()
     }
 
-    fn update_data(&mut self, data: &S3TimelineBlobData) {
+    fn update_data(&mut self, data: &RemoteTimelineBlobData) {
         self.timeline_shard_count += 1;
         if let BlobDataParseResult::Parsed {
             index_part,
@@ -120,10 +120,10 @@ pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
+    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Pageserver).await?;
 
     let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
     };
@@ -133,20 +133,20 @@ pub async fn scan_pageserver_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
     // Generate a stream of S3TimelineBlobData
     async fn report_on_timeline(
-        s3_client: &Client,
+        remote_client: &GenericRemoteStorage,
         target: &RootTarget,
         ttid: TenantShardTimelineId,
-    ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
-        let data = list_timeline_blobs(s3_client, ttid, target).await?;
+    ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
+        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
         Ok((ttid, data))
     }
-    let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
+    let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid));
     let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
     // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
@@ -157,12 +157,11 @@ pub async fn scan_pageserver_metadata(
     let mut tenant_timeline_results = Vec::new();
 
     async fn analyze_tenant(
-        s3_client: &Client,
-        target: &RootTarget,
+        remote_client: &GenericRemoteStorage,
         tenant_id: TenantId,
         summary: &mut MetadataSummary,
         mut tenant_objects: TenantObjectListing,
-        timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
+        timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
         highest_shard_count: ShardCount,
     ) {
         summary.tenant_count += 1;
@@ -191,8 +190,7 @@ pub async fn scan_pageserver_metadata(
                 // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
                 // reference counts for layers across the tenant.
                 let analysis = branch_cleanup_and_check_errors(
-                    s3_client,
-                    target,
+                    remote_client,
                     &ttid,
                     &mut tenant_objects,
                     None,
@@ -273,8 +271,7 @@ pub async fn scan_pageserver_metadata(
                     let tenant_objects = std::mem::take(&mut tenant_objects);
                     let timelines = std::mem::take(&mut tenant_timeline_results);
                     analyze_tenant(
-                        &s3_client,
-                        &target,
+                        &remote_client,
                         prev_tenant_id,
                         &mut summary,
                         tenant_objects,
@@ -311,8 +308,7 @@ pub async fn scan_pageserver_metadata(
 
     if !tenant_timeline_results.is_empty() {
         analyze_tenant(
-            &s3_client,
-            &target,
+            &remote_client,
             tenant_id.expect("Must be set if results are present"),
             &mut summary,
             tenant_objects,
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index 08a4541c5c..f20fa27d13 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -188,6 +188,11 @@ async fn check_timeline(
     // we need files, so unset it.
     timeline_dir_target.delimiter = String::new();
 
+    let prefix_str = &timeline_dir_target
+        .prefix_in_bucket
+        .strip_prefix("/")
+        .unwrap_or(&timeline_dir_target.prefix_in_bucket);
+
     let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let (key, _obj) = obj?;
@@ -195,7 +200,7 @@ async fn check_timeline(
         let seg_name = key
             .get_path()
             .as_str()
-            .strip_prefix(&timeline_dir_target.prefix_in_bucket)
+            .strip_prefix(prefix_str)
             .expect("failed to extract segment name");
         expected_segfiles.remove(seg_name);
     }
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index 1866e6ec80..fc3a973922 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -1,10 +1,11 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData};
-use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
+use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult, RemoteTimelineBlobData};
+use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines_generic};
 use crate::{
-    download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId,
+    download_object_to_file, init_remote, init_remote_generic, BucketConfig, NodeKind, RootTarget,
+    TenantShardTimelineId,
 };
 use anyhow::Context;
 use async_stream::stream;
@@ -15,6 +16,7 @@ use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
+use remote_storage::GenericRemoteStorage;
 use utils::generation::Generation;
 use utils::id::TenantId;
 
@@ -215,11 +217,11 @@ impl SnapshotDownloader {
     }
 
     pub async fn download(&self) -> anyhow::Result<()> {
-        let (s3_client, target) =
-            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;
+        let (remote_client, target) =
+            init_remote_generic(self.bucket_config.clone(), NodeKind::Pageserver).await?;
 
         // Generate a stream of TenantShardId
-        let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?;
+        let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?;
         let shards: Vec<TenantShardId> = shards.try_collect().await?;
 
         // Only read from shards that have the highest count: avoids redundantly downloading
@@ -237,18 +239,19 @@ impl SnapshotDownloader {
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
             // Generate a stream of TenantTimelineId
-            let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?;
+            let timelines = stream_tenant_timelines_generic(&remote_client, &target, shard).await?;
 
             // Generate a stream of S3TimelineBlobData
             async fn load_timeline_index(
-                s3_client: &Client,
+                remote_client: &GenericRemoteStorage,
                 target: &RootTarget,
                 ttid: TenantShardTimelineId,
-            ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> {
-                let data = list_timeline_blobs(s3_client, ttid, target).await?;
+            ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
+                let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
                 Ok((ttid, data))
             }
-            let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid));
+            let timelines =
+                timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid));
             let mut timelines = std::pin::pin!(timelines.try_buffered(8));
 
             while let Some(i) = timelines.next().await {
@@ -278,7 +281,7 @@ impl SnapshotDownloader {
 
         for (ttid, layers) in ancestor_layers.into_iter() {
             tracing::info!(
-                "Downloading {} layers from ancvestor timeline {ttid}...",
+                "Downloading {} layers from ancestor timeline {ttid}...",
                 layers.len()
             );
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ba6fbc003a..9aa275d343 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4643,6 +4643,7 @@ class StorageScrubber:
         ]
         args = base_args + args
 
+        log.info(f"Invoking scrubber command {args} with env: {env}")
         (output_path, stdout, status_code) = subprocess_capture(
             self.log_dir,
             args,

From 6949b45e1795816507f5025a474e15d718e07456 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Aug 2024 17:44:10 +0200
Subject: [PATCH 04/55] Update aws -> infra for repo rename (#8755)

See slack thread:
https://neondb.slack.com/archives/C039YKBRZB4/p1722501766006179
---
 .github/workflows/build_and_test.yml               | 10 +++++-----
 docs/rfcs/033-storage-controller-drain-and-fill.md |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ee6d3ba005..92fff4ffbc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -985,10 +985,10 @@ jobs:
           GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
         run: |
           if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
             gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
               -f deployPgSniRouter=false \
               -f deployProxy=false \
               -f deployStorage=true \
@@ -998,14 +998,14 @@ jobs:
               -f dockerTag=${{needs.tag.outputs.build-tag}} \
               -f deployPreprodRegion=true
 
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
               -f deployStorage=true \
               -f deployStorageBroker=true \
               -f deployStorageController=true \
               -f branch=main \
               -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
               -f deployPgSniRouter=true \
               -f deployProxy=true \
               -f deployStorage=false \
@@ -1015,7 +1015,7 @@ jobs:
               -f dockerTag=${{needs.tag.outputs.build-tag}} \
               -f deployPreprodRegion=true
 
-            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
+            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
               -f deployPgSniRouter=true \
               -f deployProxy=true \
               -f branch=main \
diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md
index 77c84cd2a5..733f7c0bd8 100644
--- a/docs/rfcs/033-storage-controller-drain-and-fill.md
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
 during the restart at 2024-04-03 16:37 UTC.
 
 Note that lots of shutdowns on loaded pageservers do not finish within the
-[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
+[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
 and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
 
 This problem is not yet very acutely felt in storage controller managed pageservers since

From 4b26783c94b582dad20efb49ca2ca842c6f944b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 19 Aug 2024 23:58:47 +0200
Subject: [PATCH 05/55] scrubber: remove _generic postfix and two unused
 functions (#8761)

Removes the `_generic` postfix from the APIs that use
`GenericRemoteStorage`, as `remote_storage` is the "default" now, and adds
a `_s3` postfix to the remaining APIs using the S3 SDK (only in tenant
snapshot). Also removes two unused functions: `list_objects_with_retries`
and `stream_tenants`.

Part of https://github.com/neondatabase/neon/issues/7547
---
 storage_scrubber/src/checks.rs                | 14 ++---
 storage_scrubber/src/find_large_objects.rs    |  7 +--
 storage_scrubber/src/garbage.rs               | 26 ++++-----
 storage_scrubber/src/lib.rs                   | 51 +++-------------
 storage_scrubber/src/metadata_stream.rs       | 58 +++----------------
 .../src/pageserver_physical_gc.rs             | 19 +++---
 .../src/scan_pageserver_metadata.rs           | 14 ++---
 .../src/scan_safekeeper_metadata.rs           |  9 ++-
 storage_scrubber/src/tenant_snapshot.rs       | 17 +++---
 9 files changed, 67 insertions(+), 148 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 9063b3c197..b35838bcf7 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -10,8 +10,8 @@ use utils::generation::Generation;
 use utils::id::TimelineId;
 
 use crate::cloud_admin_api::BranchData;
-use crate::metadata_stream::stream_listing_generic;
-use crate::{download_object_with_retries_generic, RootTarget, TenantShardTimelineId};
+use crate::metadata_stream::stream_listing;
+use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
 use pageserver::tenant::storage_layer::LayerName;
@@ -320,17 +320,17 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
     }
 }
 
-pub(crate) async fn list_timeline_blobs_generic(
+pub(crate) async fn list_timeline_blobs(
     remote_client: &GenericRemoteStorage,
     id: TenantShardTimelineId,
-    s3_root: &RootTarget,
+    root_target: &RootTarget,
 ) -> anyhow::Result<RemoteTimelineBlobData> {
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
     let mut unknown_keys = Vec::new();
 
-    let mut timeline_dir_target = s3_root.timeline_root(&id);
+    let mut timeline_dir_target = root_target.timeline_root(&id);
     timeline_dir_target.delimiter = String::new();
 
     let mut index_part_keys: Vec<ListingObject> = Vec::new();
@@ -341,7 +341,7 @@ pub(crate) async fn list_timeline_blobs_generic(
         .strip_prefix("/")
         .unwrap_or(&timeline_dir_target.prefix_in_bucket);
 
-    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let (key, Some(obj)) = obj? else {
             panic!("ListingObject not specified");
@@ -421,7 +421,7 @@ pub(crate) async fn list_timeline_blobs_generic(
 
     if let Some(index_part_object_key) = index_part_object.as_ref() {
         let index_part_bytes =
-            download_object_with_retries_generic(remote_client, &index_part_object_key.key)
+            download_object_with_retries(remote_client, &index_part_object_key.key)
                 .await
                 .context("index_part.json download")?;
 
diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs
index f5bb7e088a..88e36af560 100644
--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -6,7 +6,7 @@ use remote_storage::ListingMode;
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
+    checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants,
     stream_objects_with_retries, BucketConfig, NodeKind,
 };
 
@@ -50,9 +50,8 @@ pub async fn find_large_objects(
     ignore_deltas: bool,
     concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
-    let tenants = pin!(stream_tenants_generic(&remote_client, &target));
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let tenants = pin!(stream_tenants(&remote_client, &target));
 
     let objects_stream = tenants.map_ok(|tenant_shard_id| {
         let mut tenant_root = target.tenant_root(&tenant_shard_id);
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index d6a73bf366..3e22960f8d 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -19,8 +19,8 @@ use utils::id::TenantId;
 
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote_generic, list_objects_with_retries_generic,
-    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
+    init_remote, list_objects_with_retries,
+    metadata_stream::{stream_tenant_timelines, stream_tenants},
     BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };
 
@@ -153,7 +153,7 @@ async fn find_garbage_inner(
     node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
     // Construct clients for S3 and for Console API
-    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
+    let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
     let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
 
     // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -179,7 +179,7 @@ async fn find_garbage_inner(
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants_generic(&remote_client, &target);
+    let tenants = stream_tenants(&remote_client, &target);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
         let console_cache = console_cache.clone();
@@ -237,14 +237,13 @@ async fn find_garbage_inner(
         // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
         // identify it as purge-able anyway
         if console_result.is_none() {
-            let timelines =
-                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
-                    .await?
-                    .collect::<Vec<_>>()
-                    .await;
+            let timelines = stream_tenant_timelines(&remote_client, &target, tenant_shard_id)
+                .await?
+                .collect::<Vec<_>>()
+                .await;
             if timelines.is_empty() {
                 // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
-                let tenant_objects = list_objects_with_retries_generic(
+                let tenant_objects = list_objects_with_retries(
                     &remote_client,
                     ListingMode::WithDelimiter,
                     &target.tenant_root(&tenant_shard_id),
@@ -265,7 +264,7 @@ async fn find_garbage_inner(
 
                 for timeline_r in timelines {
                     let timeline = timeline_r?;
-                    let timeline_objects = list_objects_with_retries_generic(
+                    let timeline_objects = list_objects_with_retries(
                         &remote_client,
                         ListingMode::WithDelimiter,
                         &target.timeline_root(&timeline),
@@ -331,8 +330,7 @@ async fn find_garbage_inner(
 
     // Construct a stream of all timelines within active tenants
     let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines =
-        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
+    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, *t));
     let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -507,7 +505,7 @@ pub async fn purge_garbage(
     );
 
     let (remote_client, _target) =
-        init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
+        init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
 
     assert_eq!(
         &garbage_list.bucket_config.bucket,
diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 3183bc3c64..112f052e07 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -15,7 +15,7 @@ use std::fmt::Display;
 use std::sync::Arc;
 use std::time::Duration;
 
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use aws_config::retry::{RetryConfigBuilder, RetryMode};
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::error::DisplayErrorContext;
@@ -352,7 +352,7 @@ fn make_root_target(
     }
 }
 
-async fn init_remote(
+async fn init_remote_s3(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
@@ -369,7 +369,7 @@ async fn init_remote(
     Ok((s3_client, s3_root))
 }
 
-async fn init_remote_generic(
+async fn init_remote(
     bucket_config: BucketConfig,
     node_kind: NodeKind,
 ) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
@@ -394,45 +394,10 @@ async fn init_remote_generic(
 
     // We already pass the prefix to the remote client above
     let prefix_in_root_target = String::new();
-    let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
+    let root_target = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
 
     let client = GenericRemoteStorage::from_config(&storage_config).await?;
-    Ok((client, s3_root))
-}
-
-async fn list_objects_with_retries(
-    s3_client: &Client,
-    s3_target: &S3Target,
-    continuation_token: Option<String>,
-) -> anyhow::Result<aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output> {
-    for trial in 0..MAX_RETRIES {
-        match s3_client
-            .list_objects_v2()
-            .bucket(&s3_target.bucket_name)
-            .prefix(&s3_target.prefix_in_bucket)
-            .delimiter(&s3_target.delimiter)
-            .set_continuation_token(continuation_token.clone())
-            .send()
-            .await
-        {
-            Ok(response) => return Ok(response),
-            Err(e) => {
-                if trial == MAX_RETRIES - 1 {
-                    return Err(e)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                }
-                error!(
-                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
-                    s3_target.bucket_name,
-                    s3_target.prefix_in_bucket,
-                    s3_target.delimiter,
-                    DisplayErrorContext(e),
-                );
-                tokio::time::sleep(Duration::from_secs(1)).await;
-            }
-        }
-    }
-    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
+    Ok((client, root_target))
 }
 
 /// Listing possibly large amounts of keys in a streaming fashion.
@@ -479,7 +444,7 @@ fn stream_objects_with_retries<'a>(
 
 /// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
 /// use [`stream_objects_with_retries`] instead.
-async fn list_objects_with_retries_generic(
+async fn list_objects_with_retries(
     remote_client: &GenericRemoteStorage,
     listing_mode: ListingMode,
     s3_target: &S3Target,
@@ -516,7 +481,7 @@ async fn list_objects_with_retries_generic(
     panic!("MAX_RETRIES is not allowed to be 0");
 }
 
-async fn download_object_with_retries_generic(
+async fn download_object_with_retries(
     remote_client: &GenericRemoteStorage,
     key: &RemotePath,
 ) -> anyhow::Result<Vec<u8>> {
@@ -552,7 +517,7 @@ async fn download_object_with_retries_generic(
     anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
 }
 
-async fn download_object_to_file(
+async fn download_object_to_file_s3(
     s3_client: &Client,
     bucket_name: &str,
     key: &str,
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index eca774413a..10d77937f1 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -2,20 +2,19 @@ use std::str::FromStr;
 
 use anyhow::{anyhow, Context};
 use async_stream::{stream, try_stream};
-use aws_sdk_s3::Client;
 use futures::StreamExt;
 use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
 use tokio_stream::Stream;
 
 use crate::{
-    list_objects_with_retries, list_objects_with_retries_generic, stream_objects_with_retries,
-    RootTarget, S3Target, TenantShardTimelineId,
+    list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
+    TenantShardTimelineId,
 };
 use pageserver_api::shard::TenantShardId;
 use utils::id::{TenantId, TimelineId};
 
 /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
-pub fn stream_tenants_generic<'a>(
+pub fn stream_tenants<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
 ) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
@@ -36,44 +35,6 @@ pub fn stream_tenants_generic<'a>(
     }
 }
 
-/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
-pub fn stream_tenants<'a>(
-    s3_client: &'a Client,
-    target: &'a RootTarget,
-) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
-    try_stream! {
-        let mut continuation_token = None;
-        let tenants_target = target.tenants_root();
-        loop {
-            let fetch_response =
-                list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?;
-
-            let new_entry_ids = fetch_response
-                .common_prefixes()
-                .iter()
-                .filter_map(|prefix| prefix.prefix())
-                .filter_map(|prefix| -> Option<&str> {
-                    prefix
-                        .strip_prefix(&tenants_target.prefix_in_bucket)?
-                        .strip_suffix('/')
-                }).map(|entry_id_str| {
-                entry_id_str
-                    .parse()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            });
-
-            for i in new_entry_ids {
-                yield i?;
-            }
-
-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-    }
-}
-
 pub async fn stream_tenant_shards<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
@@ -85,12 +46,9 @@ pub async fn stream_tenant_shards<'a>(
     let prefix_str = &strip_prefix.strip_prefix("/").unwrap_or(&strip_prefix);
 
     tracing::info!("Listing shards in {}", shards_target.prefix_in_bucket);
-    let listing = list_objects_with_retries_generic(
-        remote_client,
-        ListingMode::WithDelimiter,
-        &shards_target,
-    )
-    .await?;
+    let listing =
+        list_objects_with_retries(remote_client, ListingMode::WithDelimiter, &shards_target)
+            .await?;
 
     let tenant_shard_ids = listing
         .prefixes
@@ -118,7 +76,7 @@ pub async fn stream_tenant_shards<'a>(
 /// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
 /// using a listing. The listing is done before the stream is built, so that this
 /// function can be used to generate concurrency on a stream using buffer_unordered.
-pub async fn stream_tenant_timelines_generic<'a>(
+pub async fn stream_tenant_timelines<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
     tenant: TenantShardId,
@@ -173,7 +131,7 @@ pub async fn stream_tenant_timelines_generic<'a>(
     })
 }
 
-pub(crate) fn stream_listing_generic<'a>(
+pub(crate) fn stream_listing<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a S3Target,
 ) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 6828081128..88681e38c2 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -2,9 +2,9 @@ use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::Duration;
 
-use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult};
-use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
-use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use crate::checks::{list_timeline_blobs, BlobDataParseResult};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
@@ -352,7 +352,7 @@ async fn gc_ancestor(
     summary: &mut GcSummary,
 ) -> anyhow::Result<()> {
     // Scan timelines in the ancestor
-    let timelines = stream_tenant_timelines_generic(remote_client, root_target, ancestor).await?;
+    let timelines = stream_tenant_timelines(remote_client, root_target, ancestor).await?;
     let mut timelines = std::pin::pin!(timelines);
 
     // Build a list of keys to retain
@@ -360,7 +360,7 @@ async fn gc_ancestor(
     while let Some(ttid) = timelines.next().await {
         let ttid = ttid?;
 
-        let data = list_timeline_blobs_generic(remote_client, ttid, root_target).await?;
+        let data = list_timeline_blobs(remote_client, ttid, root_target).await?;
 
         let s3_layers = match data.blob_data {
             BlobDataParseResult::Parsed {
@@ -456,11 +456,10 @@ pub async fn pageserver_physical_gc(
     min_age: Duration,
     mode: GcMode,
 ) -> anyhow::Result<GcSummary> {
-    let (remote_client, target) =
-        init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
+    let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
 
     let tenants = if tenant_shard_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
+        futures::future::Either::Left(stream_tenants(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
     };
@@ -473,7 +472,7 @@ pub async fn pageserver_physical_gc(
     let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -487,7 +486,7 @@ pub async fn pageserver_physical_gc(
         accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
     ) -> anyhow::Result<GcSummary> {
         let mut summary = GcSummary::default();
-        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
+        let data = list_timeline_blobs(remote_client, ttid, target).await?;
 
         let (index_part, latest_gen, candidates) = match &data.blob_data {
             BlobDataParseResult::Parsed {
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index e89e97ccb6..151ef27672 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -1,11 +1,11 @@
 use std::collections::{HashMap, HashSet};
 
 use crate::checks::{
-    branch_cleanup_and_check_errors, list_timeline_blobs_generic, BlobDataParseResult,
+    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult,
     RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis,
 };
-use crate::metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic};
-use crate::{init_remote_generic, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver_api::controller_api::MetadataHealthUpdateRequest;
@@ -120,10 +120,10 @@ pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
-    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Pageserver).await?;
+    let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
 
     let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants_generic(&remote_client, &target))
+        futures::future::Either::Left(stream_tenants(&remote_client, &target))
     } else {
         futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
     };
@@ -133,7 +133,7 @@ pub async fn scan_pageserver_metadata(
     const CONCURRENCY: usize = 32;
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, t));
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
     let timelines = timelines.try_buffered(CONCURRENCY);
     let timelines = timelines.try_flatten();
 
@@ -143,7 +143,7 @@ pub async fn scan_pageserver_metadata(
         target: &RootTarget,
         ttid: TenantShardTimelineId,
     ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
-        let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
+        let data = list_timeline_blobs(remote_client, ttid, target).await?;
         Ok((ttid, data))
     }
     let timelines = timelines.map_ok(|ttid| report_on_timeline(&remote_client, &target, ttid));
diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
index f20fa27d13..1a9f3d0ef5 100644
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -14,9 +14,8 @@ use utils::{
 };
 
 use crate::{
-    cloud_admin_api::CloudAdminApiClient, init_remote_generic,
-    metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
-    TenantShardTimelineId,
+    cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
 };
 
 /// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
@@ -107,7 +106,7 @@ pub async fn scan_safekeeper_metadata(
     let timelines = client.query(&query, &[]).await?;
     info!("loaded {} timelines", timelines.len());
 
-    let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
+    let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
     let console_config = ConsoleConfig::from_env()?;
     let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
 
@@ -193,7 +192,7 @@ async fn check_timeline(
         .strip_prefix("/")
         .unwrap_or(&timeline_dir_target.prefix_in_bucket);
 
-    let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
     while let Some(obj) = stream.next().await {
         let (key, _obj) = obj?;
 
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
index fc3a973922..bb4079b5f4 100644
--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -1,10 +1,10 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::checks::{list_timeline_blobs_generic, BlobDataParseResult, RemoteTimelineBlobData};
-use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines_generic};
+use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData};
+use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines};
 use crate::{
-    download_object_to_file, init_remote, init_remote_generic, BucketConfig, NodeKind, RootTarget,
+    download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget,
     TenantShardTimelineId,
 };
 use anyhow::Context;
@@ -36,7 +36,8 @@ impl SnapshotDownloader {
         output_path: Utf8PathBuf,
         concurrency: usize,
     ) -> anyhow::Result<Self> {
-        let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
+        let (s3_client, s3_root) =
+            init_remote_s3(bucket_config.clone(), NodeKind::Pageserver).await?;
         Ok(Self {
             s3_client,
             s3_root,
@@ -93,7 +94,7 @@ impl SnapshotDownloader {
             let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else {
                 return Err(anyhow::anyhow!("No versions found for {remote_layer_path}"));
             };
-            download_object_to_file(
+            download_object_to_file_s3(
                 &self.s3_client,
                 &self.bucket_config.bucket,
                 &remote_layer_path,
@@ -218,7 +219,7 @@ impl SnapshotDownloader {
 
     pub async fn download(&self) -> anyhow::Result<()> {
         let (remote_client, target) =
-            init_remote_generic(self.bucket_config.clone(), NodeKind::Pageserver).await?;
+            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;
 
         // Generate a stream of TenantShardId
         let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?;
@@ -239,7 +240,7 @@ impl SnapshotDownloader {
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
             // Generate a stream of TenantTimelineId
-            let timelines = stream_tenant_timelines_generic(&remote_client, &target, shard).await?;
+            let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?;
 
             // Generate a stream of S3TimelineBlobData
             async fn load_timeline_index(
@@ -247,7 +248,7 @@ impl SnapshotDownloader {
                 target: &RootTarget,
                 ttid: TenantShardTimelineId,
             ) -> anyhow::Result<(TenantShardTimelineId, RemoteTimelineBlobData)> {
-                let data = list_timeline_blobs_generic(remote_client, ttid, target).await?;
+                let data = list_timeline_blobs(remote_client, ttid, target).await?;
                 Ok((ttid, data))
             }
             let timelines =

From 4c5a0fdc75c16b007ed9c042d41ec37bae1c0f75 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 7 Aug 2024 19:26:06 +0300
Subject: [PATCH 06/55] safekeeper: check for non-consecutive writes in
 safekeeper.rs

wal_storage.rs already checks this, but since this scenario can legitimately
occur in practice, check it in safekeeper.rs (consensus level) as well.

ref https://github.com/neondatabase/neon/issues/8212
---
 safekeeper/src/safekeeper.rs                  | 27 +++++++++++++++++++
 safekeeper/src/wal_storage.rs                 |  6 +++++
 .../tests/walproposer_sim/safekeeper_disk.rs  |  4 +++
 3 files changed, 37 insertions(+)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 0814d9ba67..9d103887ae 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -875,6 +875,29 @@ where
             return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
         }
 
+        // Disallow any non-sequential writes, which can result in gaps or
+        // overwrites. If we need to move the pointer, ProposerElected message
+        // should have truncated WAL first accordingly. Note that the first
+        // condition (WAL rewrite) is quite expected in real world; it happens
+        // when walproposer reconnects to safekeeper and writes some more data
+        // while first connection still gets some packets later. It might be
+        // better to not log this as error! above.
+        let write_lsn = self.wal_store.write_lsn();
+        if write_lsn > msg.h.begin_lsn {
+            bail!(
+                "append request rewrites WAL written before, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn
+            );
+        }
+        if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
+            bail!(
+                "append request creates gap in written WAL, write_lsn={}, msg lsn={}",
+                write_lsn,
+                msg.h.begin_lsn,
+            );
+        }
+
         // Now we know that we are in the same term as the proposer,
         // processing the message.
 
@@ -1005,6 +1028,10 @@ mod tests {
 
     #[async_trait::async_trait]
     impl wal_storage::Storage for DummyWalStore {
+        fn write_lsn(&self) -> Lsn {
+            self.lsn
+        }
+
         fn flush_lsn(&self) -> Lsn {
             self.lsn
         }
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index ded8571a3e..5136bdb9d8 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -37,6 +37,8 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
 
 #[async_trait::async_trait]
 pub trait Storage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
@@ -327,6 +329,10 @@ impl PhysicalStorage {
 
 #[async_trait::async_trait]
 impl Storage for PhysicalStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index c2db9de78a..be56e86562 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -177,6 +177,10 @@ impl DiskWALStorage {
 
 #[async_trait::async_trait]
 impl wal_storage::Storage for DiskWALStorage {
+    // Last written LSN.
+    fn write_lsn(&self) -> Lsn {
+        self.write_lsn
+    }
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn

From ef57e73fbf4ab4972d07e598d0b1ab3139a5abbf Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 20 Aug 2024 10:26:44 +0200
Subject: [PATCH 07/55] task_mgr::spawn: require a `TenantId` (#8462)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… to dis-incentivize global tasks via task_mgr in the future

(As of https://github.com/neondatabase/neon/pull/8339 all remaining
task_mgr usage is tenant or timeline scoped.)
---
 pageserver/src/task_mgr.rs                      | 15 +++++----------
 pageserver/src/tenant.rs                        |  2 +-
 pageserver/src/tenant/remote_timeline_client.rs |  2 +-
 pageserver/src/tenant/tasks.rs                  |  6 +++---
 pageserver/src/tenant/timeline.rs               |  8 ++++----
 pageserver/src/tenant/timeline/delete.rs        |  2 +-
 pageserver/src/tenant/timeline/eviction_task.rs |  2 +-
 7 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 5cd78874c1..ed9e001fd2 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -393,7 +393,7 @@ struct PageServerTask {
 
     /// Tasks may optionally be launched for a particular tenant/timeline, enabling
     /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_shard_id: TenantShardId,
     timeline_id: Option<TimelineId>,
 
     mutable: Mutex<MutableTaskState>,
@@ -405,7 +405,7 @@ struct PageServerTask {
 pub fn spawn<F>(
     runtime: &tokio::runtime::Handle,
     kind: TaskKind,
-    tenant_shard_id: Option<TenantShardId>,
+    tenant_shard_id: TenantShardId,
     timeline_id: Option<TimelineId>,
     name: &str,
     future: F,
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
         let tasks = TASKS.lock().unwrap();
         for task in tasks.values() {
             if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
+                && (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
                 && (timeline_id.is_none() || task.timeline_id == timeline_id)
             {
                 task.cancel.cancel();
@@ -573,13 +573,8 @@ pub async fn shutdown_tasks(
         };
         if let Some(mut join_handle) = join_handle {
             if log_all {
-                if tenant_shard_id.is_none() {
-                    // there are quite few of these
-                    info!(name = task.name, kind = ?task_kind, "stopping global task");
-                } else {
-                    // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
-                }
+                // warn to catch these in tests; there shouldn't be any
+                warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
             }
             if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
                 .await
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 8ab8d08ce1..65a7504b74 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -798,7 +798,7 @@ impl Tenant {
         task_mgr::spawn(
             &tokio::runtime::Handle::current(),
             TaskKind::Attach,
-            Some(tenant_shard_id),
+            tenant_shard_id,
             None,
             "attach tenant",
             async move {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index b4d7ad1e97..71b766e4c7 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
             task_mgr::spawn(
                 &self.runtime,
                 TaskKind::RemoteUploadTask,
-                Some(self.tenant_shard_id),
+                self.tenant_shard_id,
                 Some(self.timeline_id),
                 "remote upload",
                 async move {
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index dbcd704b4e..3972685a8e 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -98,7 +98,7 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
-        Some(tenant_shard_id),
+        tenant_shard_id,
         None,
         &format!("compactor for tenant {tenant_shard_id}"),
         {
@@ -121,7 +121,7 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
+        tenant_shard_id,
         None,
         &format!("garbage collector for tenant {tenant_shard_id}"),
         {
@@ -144,7 +144,7 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::IngestHousekeeping,
-        Some(tenant_shard_id),
+        tenant_shard_id,
         None,
         &format!("ingest housekeeping for tenant {tenant_shard_id}"),
         {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 26dc87c373..9732cf8b50 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2281,7 +2281,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::LayerFlushTask,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "layer flush task",
             async move {
@@ -2635,7 +2635,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::InitialLogicalSizeCalculation,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "initial size calculation",
             // NB: don't log errors here, task_mgr will do that.
@@ -2803,7 +2803,7 @@ impl Timeline {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::OndemandLogicalSizeCalculation,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "ondemand logical size calculation",
             async move {
@@ -5162,7 +5162,7 @@ impl Timeline {
         let task_id = task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             task_mgr::TaskKind::DownloadAllRemoteLayers,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             "download all remote layers task",
             async move {
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index b03dbb092e..dc4118bb4a 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_shard_id),
+            tenant_shard_id,
             Some(timeline_id),
             "timeline_delete",
             async move {
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index eaa9c0ff62..2f6cb4d73a 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,7 +60,7 @@ impl Timeline {
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
-            Some(self.tenant_shard_id),
+            self.tenant_shard_id,
             Some(self.timeline_id),
             &format!(
                 "layer eviction for {}/{}",

From c96593b473a22e76ce6dae912177128c3ec21867 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 20 Aug 2024 10:46:58 +0100
Subject: [PATCH 08/55] Make Postgres 16 default version (#8745)

## Problem

The default Postgres version is set to 15 in the code, while we use 16 in
most other places (and Postgres 17 is coming).

## Summary of changes
- Run `benchmarks` job with Postgres 16 (instead of Postgres 14)
- Set `DEFAULT_PG_VERSION` to 16 in all places
- Remove deprecated `--pg-version` pytest argument
- Update `test_metadata_bincode_serde_ensure_roundtrip` for Postgres 16
---
 .github/actions/run-python-test-set/action.yml |  2 +-
 .github/workflows/build_and_test.yml           |  1 +
 README.md                                      |  2 +-
 control_plane/src/bin/neon_local.rs            |  2 +-
 control_plane/src/local_env.rs                 |  2 +-
 pageserver/src/lib.rs                          |  2 +-
 pageserver/src/tenant/metadata.rs              |  4 ++--
 scripts/ps_ec2_setup_instance_store            |  2 +-
 test_runner/README.md                          |  3 +--
 test_runner/fixtures/pg_version.py             | 18 +-----------------
 test_runner/performance/README.md              |  2 +-
 test_runner/performance/pageserver/README.md   |  2 +-
 .../interactive/test_many_small_tenants.py     |  2 +-
 test_runner/regress/test_compatibility.py      |  4 ++--
 14 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 4ccf190c6a..814067fb8e 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -43,7 +43,7 @@ inputs:
   pg_version:
     description: 'Postgres version to use for tests'
     required: false
-    default: 'v14'
+    default: 'v16'
   benchmark_durations:
     description: 'benchmark durations JSON'
     required: false
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 92fff4ffbc..715f1af656 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -280,6 +280,7 @@ jobs:
           save_perf_report: ${{ github.ref_name == 'main' }}
           extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
           benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
+          pg_version: v16
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
diff --git a/README.md b/README.md
index ea0a289502..f01442da5d 100644
--- a/README.md
+++ b/README.md
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:
 
 ```sh
-DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
 ```
 
 ## Flamegraphs
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index edd88dc71c..1d66532d49 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);
 
-const DEFAULT_PG_VERSION: &str = "15";
+const DEFAULT_PG_VERSION: &str = "16";
 
 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 807519c88d..74caba2b56 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
 use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;
 
-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;
 
 //
 // This data structures represents neon_local CLI config
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5aee13cfc6..5829a1c188 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;
 
-pub const DEFAULT_PG_VERSION: u32 = 15;
+pub const DEFAULT_PG_VERSION: u32 = 16;
 
 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 6073abc8c3..190316df42 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -565,7 +565,7 @@ mod tests {
         );
         let expected_bytes = vec![
             /* TimelineMetadataHeader */
-            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
+            74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
             /* TimelineMetadataBodyV2 */
             0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
             1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -574,7 +574,7 @@ mod tests {
             0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
             0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
             0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 15, // pg_version (4 bytes)
+            0, 0, 0, 16, // pg_version (4 bytes)
             /* padding bytes */
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store
index 1f88f252eb..7c383e322f 100755
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout
 
     # test suite run
     export TEST_OUTPUT="$TEST_OUTPUT"
-    DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
+    DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
 
     # for interactive use
     export NEON_REPO_DIR="$NEON_REPO_DIR"
diff --git a/test_runner/README.md b/test_runner/README.md
index e2f26a19ce..73aa29d4bb 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -71,8 +71,7 @@ a subdirectory for each version with naming convention `v{PG_VERSION}/`.
 Inside that dir, a `bin/postgres` binary should be present.
 `DEFAULT_PG_VERSION`: The version of Postgres to use,
 This is used to construct full path to the postgres binaries.
-Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION="14"`. Alternatively,
-you can use `--pg-version` argument.
+Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16`
 `TEST_OUTPUT`: Set the directory where test state and test output files
 should go.
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 941889a2f5..e12c8e5f4a 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -3,8 +3,6 @@ import os
 from typing import Optional
 
 import pytest
-from _pytest.config import Config
-from _pytest.config.argparsing import Parser
 
 """
 This fixture is used to determine which version of Postgres to use for tests.
@@ -52,7 +50,7 @@ class PgVersion(str, enum.Enum):
         return None
 
 
-DEFAULT_VERSION: PgVersion = PgVersion.V15
+DEFAULT_VERSION: PgVersion = PgVersion.V16
 
 
 def skip_on_postgres(version: PgVersion, reason: str):
@@ -69,22 +67,8 @@ def xfail_on_postgres(version: PgVersion, reason: str):
     )
 
 
-def pytest_addoption(parser: Parser):
-    parser.addoption(
-        "--pg-version",
-        action="store",
-        type=PgVersion,
-        help="DEPRECATED: Postgres version to use for tests",
-    )
-
-
 def run_only_on_default_postgres(reason: str):
     return pytest.mark.skipif(
         PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION,
         reason=reason,
     )
-
-
-def pytest_configure(config: Config):
-    if config.getoption("--pg-version"):
-        raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index 7ad65821d4..70d75a6dcf 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up.
 You may also need to run `./scripts/pysync`.
 
 Then run the tests
-`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance`
+`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance`
 
 Some handy pytest flags for local development:
 - `-x` tells pytest to stop on first error
diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md
index fdd09cd946..56ffad9963 100644
--- a/test_runner/performance/pageserver/README.md
+++ b/test_runner/performance/pageserver/README.md
@@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time.
 Here's a full command line.
 
 ```
-RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \
+RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \
     ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
 ````
diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
index 33848b06d3..8d781c1609 100644
--- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -14,7 +14,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
 
 """
 Usage:
-DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
+DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
     ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
 """
 
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index afa5f6873c..30ff40b7df 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -39,7 +39,7 @@ from fixtures.workload import Workload
 #
 # How to run `test_backward_compatibility` locally:
 #
-#    export DEFAULT_PG_VERSION=15
+#    export DEFAULT_PG_VERSION=16
 #    export BUILD_TYPE=release
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
@@ -61,7 +61,7 @@ from fixtures.workload import Workload
 #
 # How to run `test_forward_compatibility` locally:
 #
-#    export DEFAULT_PG_VERSION=15
+#    export DEFAULT_PG_VERSION=16
 #    export BUILD_TYPE=release
 #    export CHECK_ONDISK_DATA_COMPATIBILITY=true
 #    export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}

From 02a28c01ca1083e4fb2fc28b2db761220c161ff7 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Aug 2024 12:34:53 +0100
Subject: [PATCH 09/55] Revert "safekeeper: check for non-consecutive writes in
 safekeeper.rs" (#8771)

Reverts neondatabase/neon#8640

This broke `test_last_log_term_switch` via a merge race of some kind.
---
 safekeeper/src/safekeeper.rs                  | 27 -------------------
 safekeeper/src/wal_storage.rs                 |  6 -----
 .../tests/walproposer_sim/safekeeper_disk.rs  |  4 ---
 3 files changed, 37 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 9d103887ae..0814d9ba67 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -875,29 +875,6 @@ where
             return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
         }
 
-        // Disallow any non-sequential writes, which can result in gaps or
-        // overwrites. If we need to move the pointer, ProposerElected message
-        // should have truncated WAL first accordingly. Note that the first
-        // condition (WAL rewrite) is quite expected in real world; it happens
-        // when walproposer reconnects to safekeeper and writes some more data
-        // while first connection still gets some packets later. It might be
-        // better to not log this as error! above.
-        let write_lsn = self.wal_store.write_lsn();
-        if write_lsn > msg.h.begin_lsn {
-            bail!(
-                "append request rewrites WAL written before, write_lsn={}, msg lsn={}",
-                write_lsn,
-                msg.h.begin_lsn
-            );
-        }
-        if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
-            bail!(
-                "append request creates gap in written WAL, write_lsn={}, msg lsn={}",
-                write_lsn,
-                msg.h.begin_lsn,
-            );
-        }
-
         // Now we know that we are in the same term as the proposer,
         // processing the message.
 
@@ -1028,10 +1005,6 @@ mod tests {
 
     #[async_trait::async_trait]
     impl wal_storage::Storage for DummyWalStore {
-        fn write_lsn(&self) -> Lsn {
-            self.lsn
-        }
-
         fn flush_lsn(&self) -> Lsn {
             self.lsn
         }
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 5136bdb9d8..ded8571a3e 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -37,8 +37,6 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
 
 #[async_trait::async_trait]
 pub trait Storage {
-    // Last written LSN.
-    fn write_lsn(&self) -> Lsn;
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn;
 
@@ -329,10 +327,6 @@ impl PhysicalStorage {
 
 #[async_trait::async_trait]
 impl Storage for PhysicalStorage {
-    // Last written LSN.
-    fn write_lsn(&self) -> Lsn {
-        self.write_lsn
-    }
     /// flush_lsn returns LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn
diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
index be56e86562..c2db9de78a 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs
@@ -177,10 +177,6 @@ impl DiskWALStorage {
 
 #[async_trait::async_trait]
 impl wal_storage::Storage for DiskWALStorage {
-    // Last written LSN.
-    fn write_lsn(&self) -> Lsn {
-        self.write_lsn
-    }
     /// LSN of last durably stored WAL record.
     fn flush_lsn(&self) -> Lsn {
         self.flush_record_lsn

From 1c96957e85700eaa0333fb0230f5135e7a982d1e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 20 Aug 2024 14:00:36 +0100
Subject: [PATCH 10/55] storcon: run db migrations after step down sequence
 (#8756)

## Problem

Previously, we would run db migrations before doing the step-down
sequence. This meant that the current leader would have to deal with
the schema changes and that's generally not safe.

## Summary of changes

Push the step-down procedure earlier in start-up and
do db migrations right after it (but before we load-up the in-memory
state from the db).

Epic: https://github.com/neondatabase/cloud/issues/14701
---
 control_plane/src/storage_controller.rs |   3 +-
 storage_controller/src/leadership.rs    | 136 ++++++++++++++++++++++++
 storage_controller/src/lib.rs           |   1 +
 storage_controller/src/main.rs          |  24 +----
 storage_controller/src/metrics.rs       |   2 +
 storage_controller/src/persistence.rs   |  18 ++++
 storage_controller/src/service.rs       | 102 ++++--------------
 7 files changed, 180 insertions(+), 106 deletions(-)
 create mode 100644 storage_controller/src/leadership.rs

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 2c077595a1..f6539ad5b0 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -217,7 +217,7 @@ impl StorageController {
         Ok(exitcode.success())
     }
 
-    /// Create our database if it doesn't exist, and run migrations.
+    /// Create our database if it doesn't exist
     ///
     /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
     /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
@@ -382,7 +382,6 @@ impl StorageController {
             )
             .await?;
 
-            // Run migrations on every startup, in case something changed.
             self.setup_database(postgres_port).await?;
         }
 
diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs
new file mode 100644
index 0000000000..a171bab451
--- /dev/null
+++ b/storage_controller/src/leadership.rs
@@ -0,0 +1,136 @@
+use std::sync::Arc;
+
+use hyper::Uri;
+use tokio_util::sync::CancellationToken;
+
+use crate::{
+    peer_client::{GlobalObservedState, PeerClient},
+    persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence},
+    service::Config,
+};
+
+/// Helper for storage controller leadership acquisition
+pub(crate) struct Leadership {
+    persistence: Arc<Persistence>,
+    config: Config,
+    cancel: CancellationToken,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum Error {
+    #[error(transparent)]
+    Database(#[from] DatabaseError),
+}
+
+pub(crate) type Result<T> = std::result::Result<T, Error>;
+
+impl Leadership {
+    pub(crate) fn new(
+        persistence: Arc<Persistence>,
+        config: Config,
+        cancel: CancellationToken,
+    ) -> Self {
+        Self {
+            persistence,
+            config,
+            cancel,
+        }
+    }
+
+    /// Find the current leader in the database and request it to step down if required.
+    /// Should be called early on within the start-up sequence.
+    ///
+    /// Returns a tuple of two optionals: the current leader and its observed state
+    pub(crate) async fn step_down_current_leader(
+        &self,
+    ) -> Result<(Option<ControllerPersistence>, Option<GlobalObservedState>)> {
+        let leader = self.current_leader().await?;
+        let leader_step_down_state = if let Some(ref leader) = leader {
+            if self.config.start_as_candidate {
+                self.request_step_down(leader).await
+            } else {
+                None
+            }
+        } else {
+            tracing::info!("No leader found to request step down from. Will build observed state.");
+            None
+        };
+
+        Ok((leader, leader_step_down_state))
+    }
+
+    /// Mark the current storage controller instance as the leader in the database
+    pub(crate) async fn become_leader(
+        &self,
+        current_leader: Option<ControllerPersistence>,
+    ) -> Result<()> {
+        if let Some(address_for_peers) = &self.config.address_for_peers {
+            // TODO: `address-for-peers` can become a mandatory cli arg
+            // after we update the k8s setup
+            let proposed_leader = ControllerPersistence {
+                address: address_for_peers.to_string(),
+                started_at: chrono::Utc::now(),
+            };
+
+            self.persistence
+                .update_leader(current_leader, proposed_leader)
+                .await
+                .map_err(Error::Database)
+        } else {
+            tracing::info!("No address-for-peers provided. Skipping leader persistence.");
+            Ok(())
+        }
+    }
+
+    async fn current_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
+        let res = self.persistence.get_leader().await;
+        if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res
+        {
+            const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist";
+            if err.message().trim() == REL_NOT_FOUND_MSG {
+                // Special case: if this is a brand new storage controller, migrations will not
+                // have run at this point yet, and, hence, the controllers table does not exist.
+                // Detect this case via the error string (diesel doesn't type it) and allow it.
+                tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ...");
+                return Ok(None);
+            }
+        }
+
+        res
+    }
+
+    /// Request step down from the currently registered leader in the database
+    ///
+    /// On success, returns the observed state reported by the leader.
+    /// If the leader does not respond to the step-down request, the error
+    /// is logged and None is returned.
+    async fn request_step_down(
+        &self,
+        leader: &ControllerPersistence,
+    ) -> Option<GlobalObservedState> {
+        tracing::info!("Sending step down request to {leader:?}");
+
+        // TODO: jwt token
+        let client = PeerClient::new(
+            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+            self.config.jwt_token.clone(),
+        );
+        let state = client.step_down(&self.cancel).await;
+        match state {
+            Ok(state) => Some(state),
+            Err(err) => {
+                // TODO: Make leaders periodically update a timestamp field in the
+                // database and, if the leader is not reachable from the current instance,
+                // but inferred as alive from the timestamp, abort start-up. This avoids
+                // a potential scenario in which we have two controllers acting as leaders.
+                tracing::error!(
+                    "Leader ({}) did not respond to step-down request: {}",
+                    leader.address,
+                    err
+                );
+
+                None
+            }
+        }
+    }
+}
diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs
index 2034addbe1..60e613bb5c 100644
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -8,6 +8,7 @@ mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
+mod leadership;
 pub mod metrics;
 mod node;
 mod pageserver_client;
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 7387d36690..17685b1140 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,6 +1,5 @@
 use anyhow::{anyhow, Context};
 use clap::Parser;
-use diesel::Connection;
 use hyper::Uri;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
@@ -27,9 +26,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -181,20 +177,6 @@ impl Secrets {
     }
 }
 
-/// Execute the diesel migrations that are built into this binary
-async fn migration_run(database_url: &str) -> anyhow::Result<()> {
-    use diesel::PgConnection;
-    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-    let mut conn = PgConnection::establish(database_url)?;
-
-    HarnessWithOutput::write_to_stdout(&mut conn)
-        .run_pending_migrations(MIGRATIONS)
-        .map(|_| ())
-        .map_err(|e| anyhow::anyhow!(e))?;
-
-    Ok(())
-}
-
 fn main() -> anyhow::Result<()> {
     logging::init(
         LogFormat::Plain,
@@ -304,13 +286,9 @@ async fn async_main() -> anyhow::Result<()> {
         http_service_port: args.listen.port() as i32,
     };
 
-    // After loading secrets & config, but before starting anything else, apply database migrations
+    // Validate that we can connect to the database
     Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
 
-    migration_run(&secrets.database_url)
-        .await
-        .context("Running database migrations")?;
-
     let persistence = Arc::new(Persistence::new(secrets.database_url));
 
     let service = Service::spawn(config, persistence.clone()).await?;
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index c2303e7a7f..5cfcfb4b1f 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -230,6 +230,7 @@ pub(crate) enum DatabaseErrorLabel {
     Connection,
     ConnectionPool,
     Logical,
+    Migration,
 }
 
 impl DatabaseError {
@@ -239,6 +240,7 @@ impl DatabaseError {
             Self::Connection(_) => DatabaseErrorLabel::Connection,
             Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
             Self::Logical(_) => DatabaseErrorLabel::Logical,
+            Self::Migration(_) => DatabaseErrorLabel::Migration,
         }
     }
 }
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index aebbdec0d1..16df19026c 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -25,6 +25,9 @@ use crate::metrics::{
 };
 use crate::node::Node;
 
+use diesel_migrations::{embed_migrations, EmbeddedMigrations};
+const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
+
 /// ## What do we store?
 ///
 /// The storage controller service does not store most of its state durably.
@@ -72,6 +75,8 @@ pub(crate) enum DatabaseError {
     ConnectionPool(#[from] r2d2::Error),
     #[error("Logical error: {0}")]
     Logical(String),
+    #[error("Migration error: {0}")]
+    Migration(String),
 }
 
 #[derive(measured::FixedCardinalityLabel, Copy, Clone)]
@@ -167,6 +172,19 @@ impl Persistence {
         }
     }
 
+    /// Execute the diesel migrations that are built into this binary
+    pub(crate) async fn migration_run(&self) -> DatabaseResult<()> {
+        use diesel_migrations::{HarnessWithOutput, MigrationHarness};
+
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            HarnessWithOutput::write_to_stdout(conn)
+                .run_pending_migrations(MIGRATIONS)
+                .map(|_| ())
+                .map_err(|e| DatabaseError::Migration(e.to_string()))
+        })
+        .await
+    }
+
     /// Wraps `with_conn` in order to collect latency and error metrics
     async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
     where
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 3459b44774..780f4a7ee5 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -17,8 +17,9 @@ use crate::{
     compute_hook::NotifyError,
     drain_utils::{self, TenantShardDrain, TenantShardIterator},
     id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
+    leadership::Leadership,
     metrics,
-    peer_client::{GlobalObservedState, PeerClient},
+    peer_client::GlobalObservedState,
     persistence::{
         AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
         TenantFilter,
@@ -333,7 +334,7 @@ impl From<DatabaseError> for ApiError {
             DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
                 ApiError::ShuttingDown
             }
-            DatabaseError::Logical(reason) => {
+            DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
                 ApiError::InternalServerError(anyhow::anyhow!(reason))
             }
         }
@@ -606,22 +607,15 @@ impl Service {
 
         // Before making any obeservable changes to the cluster, persist self
         // as leader in database and memory.
-        if let Some(address_for_peers) = &self.config.address_for_peers {
-            // TODO: `address-for-peers` can become a mandatory cli arg
-            // after we update the k8s setup
-            let proposed_leader = ControllerPersistence {
-                address: address_for_peers.to_string(),
-                started_at: chrono::Utc::now(),
-            };
+        let leadership = Leadership::new(
+            self.persistence.clone(),
+            self.config.clone(),
+            self.cancel.child_token(),
+        );
 
-            if let Err(err) = self
-                .persistence
-                .update_leader(current_leader, proposed_leader)
-                .await
-            {
-                tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
-                std::process::exit(1);
-            }
+        if let Err(e) = leadership.become_leader(current_leader).await {
+            tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ...");
+            std::process::exit(1);
         }
 
         self.inner.write().unwrap().become_leader();
@@ -1159,6 +1153,16 @@ impl Service {
         let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
         let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel();
 
+        let leadership_cancel = CancellationToken::new();
+        let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel);
+        let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?;
+
+        // Apply the migrations **after** the current leader has stepped down
+        // (or we've given up waiting for it), but **before** reading from the
+        // database. The only exception is reading the current leader before
+        // migrating.
+        persistence.migration_run().await?;
+
         tracing::info!("Loading nodes from database...");
         let nodes = persistence
             .list_nodes()
@@ -1376,32 +1380,6 @@ impl Service {
                     return;
                 };
 
-                let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let leader = match this.get_leader().await {
-                    Ok(ok) => ok,
-                    Err(err) => {
-                        tracing::error!(
-                            "Failed to query database for current leader: {err}. Aborting start-up ..."
-                        );
-                        std::process::exit(1);
-                    }
-                };
-
-                let leader_step_down_state = match leadership_status {
-                    LeadershipStatus::Candidate => {
-                        if let Some(ref leader) = leader {
-                            this.request_step_down(leader).await
-                        } else {
-                            tracing::info!(
-                                "No leader found to request step down from. Will build observed state."
-                            );
-                            None
-                        }
-                    }
-                    LeadershipStatus::Leader => None,
-                    LeadershipStatus::SteppedDown => unreachable!(),
-                };
-
                 this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
                     .await;
 
@@ -6377,42 +6355,4 @@ impl Service {
 
         global_observed
     }
-
-    /// Request step down from the currently registered leader in the database
-    ///
-    /// If such an entry is persisted, the success path returns the observed
-    /// state and details of the leader. Otherwise, None is returned indicating
-    /// there is no leader currently.
-    ///
-    /// On failures to query the database or step down error responses the process is killed
-    /// and we rely on k8s to retry.
-    async fn request_step_down(
-        &self,
-        leader: &ControllerPersistence,
-    ) -> Option<GlobalObservedState> {
-        tracing::info!("Sending step down request to {leader:?}");
-
-        // TODO: jwt token
-        let client = PeerClient::new(
-            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.jwt_token.clone(),
-        );
-        let state = client.step_down(&self.cancel).await;
-        match state {
-            Ok(state) => Some(state),
-            Err(err) => {
-                // TODO: Make leaders periodically update a timestamp field in the
-                // database and, if the leader is not reachable from the current instance,
-                // but inferred as alive from the timestamp, abort start-up. This avoids
-                // a potential scenario in which we have two controllers acting as leaders.
-                tracing::error!(
-                    "Leader ({}) did not respond to step-down request: {}",
-                    leader.address,
-                    err
-                );
-
-                None
-            }
-        }
-    }
 }

From 0170611a97fc233f4e3bcc56648a77fb3cf33a2c Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 20 Aug 2024 14:16:27 +0100
Subject: [PATCH 11/55] proxy: small changes (#8752)

## Problem

#8736 is getting too big, so this splits off some simple changes.

## Summary of changes

Local proxy won't always be using TLS, so make it optional. Local proxy
won't be using ws for now, so make it optional. Remove a dead config var.
---
 proxy/src/bin/proxy.rs                |  4 +-
 proxy/src/config.rs                   |  1 +
 proxy/src/serverless.rs               | 59 ++++++++++++++++++++-------
 proxy/src/serverless/conn_pool.rs     |  1 +
 proxy/src/serverless/sql_over_http.rs | 26 ++++++++----
 5 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index b44e0ddd2f..d83a1f3bcf 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -173,9 +173,6 @@ struct ProxyCliArgs {
     /// cache for `role_secret` (use `size=0` to disable)
     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
     role_secret_cache: String,
-    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    disable_ip_check_for_http: bool,
     /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
     #[clap(long)]
     redis_notifications: Option<String>,
@@ -661,6 +658,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
     )?;
 
     let http_config = HttpConfig {
+        accept_websockets: true,
         pool_options: GlobalConnPoolOptions {
             max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
             gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 36d04924f2..a280aa88ce 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -52,6 +52,7 @@ pub struct TlsConfig {
 }
 
 pub struct HttpConfig {
+    pub accept_websockets: bool,
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 5416d63b5b..b2bf93dc6d 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -10,6 +10,7 @@ mod json;
 mod sql_over_http;
 mod websocket;
 
+use async_trait::async_trait;
 use atomic_take::AtomicTake;
 use bytes::Bytes;
 pub use conn_pool::GlobalConnPoolOptions;
@@ -26,8 +27,9 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::time::timeout;
-use tokio_rustls::{server::TlsStream, TlsAcceptor};
+use tokio_rustls::TlsAcceptor;
 use tokio_util::task::TaskTracker;
 
 use crate::cancellation::CancellationHandlerMain;
@@ -41,7 +43,7 @@ use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
 
 use std::net::{IpAddr, SocketAddr};
-use std::pin::pin;
+use std::pin::{pin, Pin};
 use std::sync::Arc;
 use tokio::net::{TcpListener, TcpStream};
 use tokio_util::sync::CancellationToken;
@@ -86,18 +88,18 @@ pub async fn task_main(
         config,
         endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
     });
-
-    let tls_config = match config.tls_config.as_ref() {
-        Some(config) => config,
+    let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
+        Some(config) => {
+            let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config());
+            // prefer http2, but support http/1.1
+            tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+            Arc::new(tls_server_config) as Arc<_>
+        }
         None => {
-            warn!("TLS config is missing, WebSocket Secure server will not be started");
-            return Ok(());
+            warn!("TLS config is missing");
+            Arc::new(NoTls) as Arc<_>
         }
     };
-    let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
-    // prefer http2, but support http/1.1
-    tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
-    let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     connections.close(); // allows `connections.wait to complete`
@@ -176,16 +178,41 @@ pub async fn task_main(
     Ok(())
 }
 
+pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
+impl<T: AsyncRead + AsyncWrite + Send + 'static> AsyncReadWrite for T {}
+pub type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
+
+#[async_trait]
+trait MaybeTlsAcceptor: Send + Sync + 'static {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
+}
+
+#[async_trait]
+impl MaybeTlsAcceptor for rustls::ServerConfig {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
+        Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?))
+    }
+}
+
+struct NoTls;
+
+#[async_trait]
+impl MaybeTlsAcceptor for NoTls {
+    async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
+        Ok(Box::pin(conn))
+    }
+}
+
 /// Handles the TCP startup lifecycle.
 /// 1. Parses PROXY protocol V2
 /// 2. Handles TLS handshake
 async fn connection_startup(
     config: &ProxyConfig,
-    tls_acceptor: TlsAcceptor,
+    tls_acceptor: Arc<dyn MaybeTlsAcceptor>,
     session_id: uuid::Uuid,
     conn: TcpStream,
     peer_addr: SocketAddr,
-) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
+) -> Option<(AsyncRW, IpAddr)> {
     // handle PROXY protocol
     let (conn, peer) = match read_proxy_protocol(conn).await {
         Ok(c) => c,
@@ -241,7 +268,7 @@ async fn connection_handler(
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     cancellation_token: CancellationToken,
-    conn: TlsStream<ChainRW<TcpStream>>,
+    conn: AsyncRW,
     peer_addr: IpAddr,
     session_id: uuid::Uuid,
 ) {
@@ -326,7 +353,9 @@ async fn request_handler(
         .map(|s| s.to_string());
 
     // Check if the request is a websocket upgrade request.
-    if framed_websockets::upgrade::is_upgrade_request(&request) {
+    if config.http_config.accept_websockets
+        && framed_websockets::upgrade::is_upgrade_request(&request)
+    {
         let ctx = RequestMonitoring::new(
             session_id,
             peer_addr,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 9ede659cc4..3478787995 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -758,6 +758,7 @@ mod tests {
     async fn test_pool() {
         let _ = env_logger::try_init();
         let config = Box::leak(Box::new(crate::config::HttpConfig {
+            accept_websockets: false,
             pool_options: GlobalConnPoolOptions {
                 max_conns_per_endpoint: 2,
                 gc_epoch: Duration::from_secs(1),
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index c41df07a4d..bbfed90f39 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -147,7 +147,7 @@ impl UserFacingError for ConnInfoError {
 fn get_conn_info(
     ctx: &RequestMonitoring,
     headers: &HeaderMap,
-    tls: &TlsConfig,
+    tls: Option<&TlsConfig>,
 ) -> Result<ConnInfo, ConnInfoError> {
     // HTTP only uses cleartext (for now and likely always)
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -184,12 +184,22 @@ fn get_conn_info(
         .ok_or(ConnInfoError::MissingPassword)?;
     let password = urlencoding::decode_binary(password.as_bytes());
 
-    let hostname = connection_url
-        .host_str()
-        .ok_or(ConnInfoError::MissingHostname)?;
-
-    let endpoint =
-        endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
+    let endpoint = match connection_url.host() {
+        Some(url::Host::Domain(hostname)) => {
+            if let Some(tls) = tls {
+                endpoint_sni(hostname, &tls.common_names)?
+                    .ok_or(ConnInfoError::MalformedEndpoint)?
+            } else {
+                hostname
+                    .split_once(".")
+                    .map_or(hostname, |(prefix, _)| prefix)
+                    .into()
+            }
+        }
+        Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => {
+            return Err(ConnInfoError::MissingHostname)
+        }
+    };
     ctx.set_endpoint_id(endpoint.clone());
 
     let pairs = connection_url.query_pairs();
@@ -502,7 +512,7 @@ async fn handle_inner(
     let headers = request.headers();
 
     // TLS config should be there.
-    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
+    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
     info!(user = conn_info.user_info.user.as_str(), "credentials");
 
     // Allow connection pooling only if explicitly requested

From fa0750a37e01cee2e909d91be9b556ee2f128406 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 20 Aug 2024 15:25:21 +0100
Subject: [PATCH 12/55] storcon: add peer jwt token (#8764)

## Problem

Storage controllers did not have the right token to speak to their peers
for leadership transitions.

## Summary of changes

Accept a peer jwt token for the storage controller.

Epic: https://github.com/neondatabase/cloud/issues/14701
---
 control_plane/src/storage_controller.rs       |  5 +++++
 storage_controller/src/leadership.rs          |  3 +--
 storage_controller/src/main.rs                | 20 +++++++++++--------
 storage_controller/src/service.rs             |  3 +++
 .../regress/test_storage_controller.py        |  2 ++
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f6539ad5b0..27d8e2de0c 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -453,6 +453,11 @@ impl StorageController {
             let jwt_token =
                 encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
             args.push(format!("--jwt-token={jwt_token}"));
+
+            let peer_claims = Claims::new(None, Scope::Admin);
+            let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
+                .expect("failed to generate jwt token");
+            args.push(format!("--peer-jwt-token={peer_jwt_token}"));
         }
 
         if let Some(public_key) = &self.public_key {
diff --git a/storage_controller/src/leadership.rs b/storage_controller/src/leadership.rs
index a171bab451..5fae8991ec 100644
--- a/storage_controller/src/leadership.rs
+++ b/storage_controller/src/leadership.rs
@@ -110,10 +110,9 @@ impl Leadership {
     ) -> Option<GlobalObservedState> {
         tracing::info!("Sending step down request to {leader:?}");
 
-        // TODO: jwt token
         let client = PeerClient::new(
             Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.jwt_token.clone(),
+            self.config.peer_jwt_token.clone(),
         );
         let state = client.step_down(&self.cancel).await;
         match state {
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 17685b1140..e3f29b84e7 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -47,6 +47,9 @@ struct Cli {
     #[arg(long)]
     control_plane_jwt_token: Option<String>,
 
+    #[arg(long)]
+    peer_jwt_token: Option<String>,
+
     /// URL to control plane compute notification endpoint
     #[arg(long)]
     compute_hook_url: Option<String>,
@@ -126,28 +129,28 @@ struct Secrets {
     public_key: Option<JwtAuth>,
     jwt_token: Option<String>,
     control_plane_jwt_token: Option<String>,
+    peer_jwt_token: Option<String>,
 }
 
 impl Secrets {
     const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
     const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
     const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
+    const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
     const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
 
     /// Load secrets from, in order of preference:
     /// - CLI args if database URL is provided on the CLI
     /// - Environment variables if DATABASE_URL is set.
-    /// - AWS Secrets Manager secrets
     async fn load(args: &Cli) -> anyhow::Result<Self> {
-        let Some(database_url) =
-            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
+        let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV)
         else {
             anyhow::bail!(
                 "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
             )
         };
 
-        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
+        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) {
             Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
             None => None,
         };
@@ -155,18 +158,18 @@ impl Secrets {
         let this = Self {
             database_url,
             public_key,
-            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
+            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
             control_plane_jwt_token: Self::load_secret(
                 &args.control_plane_jwt_token,
                 Self::CONTROL_PLANE_JWT_TOKEN_ENV,
-            )
-            .await,
+            ),
+            peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV),
         };
 
         Ok(this)
     }
 
-    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
+    fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
         if let Some(v) = cli {
             Some(v.clone())
         } else if let Ok(v) = std::env::var(env_name) {
@@ -266,6 +269,7 @@ async fn async_main() -> anyhow::Result<()> {
     let config = Config {
         jwt_token: secrets.jwt_token,
         control_plane_jwt_token: secrets.control_plane_jwt_token,
+        peer_jwt_token: secrets.peer_jwt_token,
         compute_hook_url: args.compute_hook_url,
         max_offline_interval: args
             .max_offline_interval
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 780f4a7ee5..453e96bad3 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -288,6 +288,9 @@ pub struct Config {
     // This JWT token will be used to authenticate this service to the control plane.
     pub control_plane_jwt_token: Option<String>,
 
+    // This JWT token will be used to authenticate with other storage controller instances
+    pub peer_jwt_token: Option<String>,
+
     /// Where the compute hook should send notifications of pageserver attachment locations
     /// (this URL points to the control plane in prod). If this is None, the compute hook will
     /// assume it is running in a test environment and try to update neon_local.
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 95c35e9641..94d71a7677 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2144,6 +2144,8 @@ def test_storage_controller_leadership_transfer(
     port_distributor: PortDistributor,
     step_down_times_out: bool,
 ):
+    neon_env_builder.auth_enabled = True
+
     neon_env_builder.num_pageservers = 3
 
     neon_env_builder.storage_controller_config = {

From beefc7a8108e5af333bc1e453749acf872f18fdd Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 20 Aug 2024 19:47:42 +0100
Subject: [PATCH 13/55] pageserver: add metric
 pageserver_secondary_heatmap_total_size (#8768)

## Problem

We don't have a convenient way for a human to ask "how far are secondary
downloads along for this tenant".

This is useful when driving migrations of tenants to the storage
controller, as we first create a secondary location and want to see it
warm up before we cut over. That can already be done via storcon_cli,
but we would like a way that doesn't require direct API access.

## Summary of changes

Add a metric that reports the total size of layers in the heatmap: this
may be used in conjunction with the existing
`pageserver_secondary_resident_physical_size` to estimate "warmth" of
the secondary location.
---
 pageserver/src/metrics.rs                     |  9 +++++++++
 pageserver/src/tenant/secondary.rs            | 10 ++++++++++
 pageserver/src/tenant/secondary/downloader.rs |  6 ++++++
 3 files changed, 25 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index cd2cd43f27..1bc9352256 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,6 +1803,15 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
+pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_secondary_heatmap_total_size",
+        "The total size in bytes of all layers in the most recently downloaded heatmap.",
+        &["tenant_id", "shard_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 3132a28b12..1331c07d05 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
 use crate::{
     context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
+    metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
 };
 
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
 
     // Sum of layer sizes on local disk
     pub(super) resident_size_metric: UIntGauge,
+
+    // Sum of layer sizes in the most recently downloaded heatmap
+    pub(super) heatmap_total_size_metric: UIntGauge,
 }
 
 impl Drop for SecondaryTenant {
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
         let tenant_id = self.tenant_shard_id.tenant_id.to_string();
         let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
         let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
     }
 }
 
@@ -128,6 +133,10 @@ impl SecondaryTenant {
             .get_metric_with_label_values(&[&tenant_id, &shard_id])
             .unwrap();
 
+        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
         Arc::new(Self {
             tenant_shard_id,
             // todo: shall we make this a descendent of the
@@ -145,6 +154,7 @@ impl SecondaryTenant {
             progress: std::sync::Mutex::default(),
 
             resident_size_metric,
+            heatmap_total_size_metric,
         })
     }
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8cff1d2864..90e1c01dbd 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
             layers_downloaded: 0,
             bytes_downloaded: 0,
         };
+
+        // Also expose heatmap bytes_total as a metric
+        self.secondary_state
+            .heatmap_total_size_metric
+            .set(heatmap_stats.bytes);
+
         // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
         let mut delete_layers = Vec::new();
         let mut delete_timelines = Vec::new();

From c8b9116a97e047a5f349e69fda1fe96790797820 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:05:33 -0400
Subject: [PATCH 14/55] impr(pageserver): abort on fatal I/O writer error
 (#8777)

part of https://github.com/neondatabase/neon/issues/8140

The blob writer path now uses `maybe_fatal_err`

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/virtual_file.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b4695e5f40..c0017280fd 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -756,11 +756,23 @@ impl VirtualFile {
         })
     }
 
+    /// The function aborts the process if the error is fatal.
     async fn write_at<B: IoBuf + Send>(
         &self,
         buf: FullSlice<B>,
         offset: u64,
         _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (FullSlice<B>, Result<usize, Error>) {
+        let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
+        let result = result.maybe_fatal_err("write_at");
+        (slice, result)
+    }
+
+    async fn write_at_inner<B: IoBuf + Send>(
+        &self,
+        buf: FullSlice<B>,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
     ) -> (FullSlice<B>, Result<usize, Error>) {
         let file_guard = match self.lock_file().await {
             Ok(file_guard) => file_guard,

From 6d8572ded607e6cb583ff6b9a1690ceecce5a407 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Wed, 21 Aug 2024 09:08:49 +0200
Subject: [PATCH 15/55] Benchmarking: need to checkout actions to download Neon
 artifacts (#8770)

## Problem

The database preparation workflow needs Neon artifacts but does not check out
the repository, so the local download action is not available.

We were lucky in a few runs like this one

https://github.com/neondatabase/neon/actions/runs/10413970941/job/28870668020

but this is flaky and a race condition which failed here


https://github.com/neondatabase/neon/actions/runs/10446395644/job/28923749772#step:4:1



## Summary of changes

Checkout code (including actions) before invoking download action

Successful test run
https://github.com/neondatabase/neon/actions/runs/10469356296/job/28992200694
---
 .github/workflows/_benchmarking_preparation.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml
index 7229776cd6..a52e43b4da 100644
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -48,6 +48,8 @@ jobs:
 
         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT  
 
+    - uses: actions/checkout@v4
+
     - name: Download Neon artifact
       uses: ./.github/actions/download
       with:

From 21b684718e1e3e18e687d095d79322c5db9a3992 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 21 Aug 2024 12:55:01 +0200
Subject: [PATCH 16/55] pageserver: add counter for wait time on background
 loop semaphore (#8769)

## Problem

Compaction jobs and other background loops are concurrency-limited
through a global semaphore.

The current counters allow quantifying how _many_ tasks are waiting.
But there is no way to tell how _much_ delay is added by the semaphore.

So, add a counter that aggregates the wall clock time (in seconds) spent
acquiring the semaphore.

The metrics can be used as follows:

* retroactively calculate average acquisition time in a given time range
* compare the degree of background loop backlog among pageservers

The metric is insufficient to calculate

* run-up of ongoing acquisitions that haven't finished acquiring yet
* Not easily feasible because ["Cancelling a call to acquire makes you
lose your place in the
queue"](https://docs.rs/tokio/latest/tokio/sync/struct.Semaphore.html#method.acquire)

## Summary of changes

* Refactor the metrics to follow the current best practice for typed
metrics in `metrics.rs`.
* Add the new counter.
---
 pageserver/src/metrics.rs      | 69 +++++++++++++++++++++++++++++-----
 pageserver/src/tenant/tasks.rs | 11 +-----
 2 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1bc9352256..0a1a22b6e8 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1862,16 +1862,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("Failed to register tenant_task_events metric")
 });
 
-pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "pageserver_background_loop_semaphore_wait_start_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls started",
-        "pageserver_background_loop_semaphore_wait_finish_count",
-        "Counter for background loop concurrency-limiting semaphore acquire calls finished",
-        &["task"],
-    )
-    .unwrap()
-});
+pub struct BackgroundLoopSemaphoreMetrics {
+    counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
+    durations: EnumMap<BackgroundLoopKind, Counter>,
+}
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
+    || {
+        let counters = register_int_counter_pair_vec!(
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap();
+
+        let durations = register_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_duration_seconds",
+            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
+            &["task"],
+        )
+        .unwrap();
+
+        BackgroundLoopSemaphoreMetrics {
+            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+                counters.with_label_values(&[kind.into()])
+            })),
+            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+                durations.with_label_values(&[kind.into()])
+            })),
+        }
+    },
+);
+
+impl BackgroundLoopSemaphoreMetrics {
+    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
+        struct Record<'a> {
+            metrics: &'a BackgroundLoopSemaphoreMetrics,
+            task: BackgroundLoopKind,
+            _counter_guard: metrics::IntCounterPairGuard,
+            start: Instant,
+        }
+        impl Drop for Record<'_> {
+            fn drop(&mut self) {
+                let elapsed = self.start.elapsed().as_secs_f64();
+                self.metrics.durations[self.task].inc_by(elapsed);
+            }
+        }
+        Record {
+            metrics: self,
+            task,
+            _counter_guard: self.counters[task].guard(),
+            start: Instant::now(),
+        }
+    }
+}
 
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
@@ -2553,6 +2601,7 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
+use crate::tenant::tasks::BackgroundLoopKind;
 
 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 3972685a8e..12f080f3c1 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -61,21 +61,12 @@ impl BackgroundLoopKind {
     }
 }
 
-static PERMIT_GAUGES: once_cell::sync::Lazy<
-    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
-> = once_cell::sync::Lazy::new(|| {
-    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
-        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
-    }))
-});
-
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = PERMIT_GAUGES[loop_kind].guard();
+    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
 
     pausable_failpoint!(
         "initial-size-calculation-permit-pause",

From 477246f42cf984015d654521174fff763f9e1263 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 14:28:27 +0300
Subject: [PATCH 17/55] storcon: handle heartbeater shutdown gracefully (#8767)

if a heartbeat happens during shutdown, then the task is already
cancelled and will not be sending responses.

Fixes: #8766
---
 storage_controller/src/heartbeater.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index 1bb9c17f30..c0e27bafdb 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -87,9 +87,12 @@ impl Heartbeater {
                 pageservers,
                 reply: sender,
             })
-            .unwrap();
+            .map_err(|_| HeartbeaterError::Cancel)?;
 
-        receiver.await.unwrap()
+        receiver
+            .await
+            .map_err(|_| HeartbeaterError::Cancel)
+            .and_then(|x| x)
     }
 }
 

From 3b8016488efc4cecee1a956285b3365162706894 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 14:51:08 +0300
Subject: [PATCH 18/55] test: test_timeline_ancestor_detach_errors rare
 allowed_error (#8782)

Add another allowed_error for this rarity.

Fixes: #8773
---
 test_runner/regress/test_timeline_detach_ancestor.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 82fc26126d..d152d0f41f 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -639,8 +639,12 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard
 
     for ps in pageservers.values():
         ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
-        ps.allowed_errors.append(
-            ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing"
+        ps.allowed_errors.extend(
+            [
+                ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing",
+                # rare error logging, which is hard to reproduce without instrumenting responding with random sleep
+                '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)',
+            ]
         )
 
     client = (

From 75175f3628bc88053e13555a3ada8082639b2db6 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 21 Aug 2024 14:29:11 +0100
Subject: [PATCH 19/55] CI(build-and-test): run regression tests on arm (#8552)

## Problem

We want to run our regression test suite on ARM.

## Summary of changes
- run regression tests on release ARM builds
- run `build-neon` (including rust tests) on debug ARM builds
- add `arch` parameter to test to distinguish them in the allure report
and in a database
---
 .../actions/run-python-test-set/action.yml    |  4 +---
 .github/workflows/_build-and-test-locally.yml | 15 +++++++++----
 .github/workflows/build_and_test.yml          |  2 +-
 .../ingest_regress_test_result-new-format.py  | 14 ++++++++++++-
 test_runner/fixtures/parametrize.py           | 21 +++++++++++++++++++
 5 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 814067fb8e..6c2cee0971 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -169,10 +169,8 @@ runs:
           EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
         fi
 
-        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+        if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
           cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
-        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
-          cov_prefix=()
         else
           cov_prefix=()
         fi
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index af76e51ebc..5e9fff0e6a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -94,11 +94,16 @@ jobs:
       # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
       # corresponding Cargo.toml files for their descriptions.
       - name: Set env variables
+        env:
+          ARCH: ${{ inputs.arch }}
         run: |
           CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
             CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
             CARGO_FLAGS="--locked --release"
@@ -158,6 +163,8 @@ jobs:
       # Do install *before* running rust tests because they might recompile the
       # binaries with different features/flags.
       - name: Install rust binaries
+        env:
+          ARCH: ${{ inputs.arch }}
         run: |
           # Install target binaries
           mkdir -p /tmp/neon/bin/
@@ -172,7 +179,7 @@ jobs:
           done
 
           # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
             # Keep bloated coverage data files away from the rest of the artifact
             mkdir -p /tmp/coverage/
 
@@ -243,8 +250,8 @@ jobs:
         uses: ./.github/actions/save-coverage-data
 
   regress-tests:
-    # Run test on x64 only
-    if: inputs.arch == 'x64'
+    # Don't run regression tests on debug arm64 builds
+    if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
     needs: [ build-neon ]
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
     container:
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 715f1af656..1e7f3598c2 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -198,7 +198,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        arch: [ x64 ]
+        arch: [ x64, arm64 ]
         # Do not build or run tests in debug for release branches
         build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
         include:
diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py
index cff1d9875f..40d7254e00 100644
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -18,6 +18,7 @@ import psycopg2
 from psycopg2.extras import execute_values
 
 CREATE_TABLE = """
+CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN');
 CREATE TABLE IF NOT EXISTS results (
     id           BIGSERIAL PRIMARY KEY,
     parent_suite TEXT NOT NULL,
@@ -28,6 +29,7 @@ CREATE TABLE IF NOT EXISTS results (
     stopped_at   TIMESTAMPTZ NOT NULL,
     duration     INT NOT NULL,
     flaky        BOOLEAN NOT NULL,
+    arch         arch DEFAULT 'X64',
     build_type   TEXT NOT NULL,
     pg_version   INT NOT NULL,
     run_id       BIGINT NOT NULL,
@@ -35,7 +37,7 @@ CREATE TABLE IF NOT EXISTS results (
     reference    TEXT NOT NULL,
     revision     CHAR(40) NOT NULL,
     raw          JSONB COMPRESSION lz4 NOT NULL,
-    UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id)
+    UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
 );
 """
 
@@ -50,6 +52,7 @@ class Row:
     stopped_at: datetime
     duration: int
     flaky: bool
+    arch: str
     build_type: str
     pg_version: int
     run_id: int
@@ -121,6 +124,14 @@ def ingest_test_result(
         raw.pop("labels")
         raw.pop("extra")
 
+        # All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py
+        parameters = {
+            p["name"].removeprefix("__"): p["value"]
+            for p in test["parameters"]
+            if p["name"].startswith("__")
+        }
+        arch = parameters.get("arch", "UNKNOWN").strip("'")
+
         build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
         labels = {label["name"]: label["value"] for label in test["labels"]}
         row = Row(
@@ -132,6 +143,7 @@ def ingest_test_result(
             stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
             duration=test["time"]["duration"],
             flaky=test["flaky"] or test["retriesStatusChange"],
+            arch=arch,
             build_type=build_type,
             pg_version=pg_version,
             run_id=run_id,
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 0227285822..92c98763e3 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Optional
 
+import allure
 import pytest
 import toml
 from _pytest.python import Metafunc
@@ -91,3 +92,23 @@ def pytest_generate_tests(metafunc: Metafunc):
         and (platform := os.getenv("PLATFORM")) is not None
     ):
         metafunc.parametrize("platform", [platform.lower()])
+
+
+@pytest.hookimpl(hookwrapper=True, tryfirst=True)
+def pytest_runtest_makereport(*args, **kwargs):
+    # Add test parameters to Allure report to distinguish the same tests with different parameters.
+    # Names have a `__` prefix to avoid conflicts with `pytest.mark.parametrize` parameters
+
+    # A mapping between `uname -m` and `RUNNER_ARCH` values.
+    # `RUNNER_ARCH` environment variable is set on GitHub Runners,
+    # possible values are X86, X64, ARM, or ARM64.
+    # See https://docs.github.com/en/actions/learn-github-actions/variables#default-environment-variables
+    uname_m = {
+        "aarch64": "ARM64",
+        "arm64": "ARM64",
+        "x86_64": "X64",
+    }.get(os.uname().machine, "UNKNOWN")
+    arch = os.getenv("RUNNER_ARCH", uname_m)
+    allure.dynamic.parameter("__arch", arch)
+
+    yield

From 428b105dde089d402b1de035a8cb43ebea930583 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 21 Aug 2024 14:45:32 +0100
Subject: [PATCH 20/55] remove workspace hack from libs (#8780)

This removes workspace hack from all libs, not from any binaries. This
does not change the behaviour of the hack.

Running
```
cargo clean
cargo build --release --bin proxy
```

Before this change, the build took 5m16s. After this change, it took 3m3s.
This is because the change allows much more of the build to run in parallel.
---
 .config/hakari.toml                    | 28 ++++++++++++++++++++++----
 Cargo.lock                             | 18 -----------------
 libs/compute_api/Cargo.toml            |  2 --
 libs/consumption_metrics/Cargo.toml    |  4 +---
 libs/desim/Cargo.toml                  |  2 --
 libs/metrics/Cargo.toml                |  2 --
 libs/pageserver_api/Cargo.toml         |  4 +---
 libs/postgres_backend/Cargo.toml       |  1 -
 libs/postgres_connection/Cargo.toml    |  2 --
 libs/postgres_ffi/Cargo.toml           |  2 --
 libs/postgres_ffi/wal_craft/Cargo.toml |  2 --
 libs/pq_proto/Cargo.toml               |  4 +---
 libs/remote_storage/Cargo.toml         |  3 ++-
 libs/safekeeper_api/Cargo.toml         |  2 --
 libs/tenant_size_model/Cargo.toml      |  2 --
 libs/tracing-utils/Cargo.toml          |  2 --
 libs/utils/Cargo.toml                  |  4 ++--
 libs/walproposer/Cargo.toml            |  2 --
 workspace_hack/Cargo.toml              |  3 ---
 19 files changed, 31 insertions(+), 58 deletions(-)

diff --git a/.config/hakari.toml b/.config/hakari.toml
index 9913ecc9c0..b5990d090e 100644
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -23,10 +23,30 @@ platforms = [
 ]
 
 [final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
+workspace-members = [
+    # vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+    # it is built primarily in separate repo neondatabase/autoscaling and thus is excluded
+    # from depending on workspace-hack because most of the dependencies are not used.
+    "vm_monitor",
+    # All of these exist in libs and are not usually built independently.
+    # Putting workspace hack there adds a bottleneck for cargo builds.
+    "compute_api",
+    "consumption_metrics",
+    "desim",
+    "metrics",
+    "pageserver_api",
+    "postgres_backend",
+    "postgres_connection",
+    "postgres_ffi",
+    "pq_proto",
+    "remote_storage",
+    "safekeeper_api",
+    "tenant_size_model",
+    "tracing-utils",
+    "utils",
+    "wal_craft",
+    "walproposer",
+]
 
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
diff --git a/Cargo.lock b/Cargo.lock
index dee15b6aa7..a506da8c02 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1208,7 +1208,6 @@ dependencies = [
  "serde_json",
  "serde_with",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -1321,7 +1320,6 @@ dependencies = [
  "serde",
  "serde_with",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -1670,7 +1668,6 @@ dependencies = [
  "smallvec",
  "tracing",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -3147,7 +3144,6 @@ dependencies = [
  "rand 0.8.5",
  "rand_distr",
  "twox-hash",
- "workspace_hack",
 ]
 
 [[package]]
@@ -3791,7 +3787,6 @@ dependencies = [
  "strum_macros",
  "thiserror",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4193,7 +4188,6 @@ dependencies = [
  "tokio-rustls 0.25.0",
  "tokio-util",
  "tracing",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4206,7 +4200,6 @@ dependencies = [
  "postgres",
  "tokio-postgres",
  "url",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4229,7 +4222,6 @@ dependencies = [
  "serde",
  "thiserror",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4267,7 +4259,6 @@ dependencies = [
  "thiserror",
  "tokio",
  "tracing",
- "workspace_hack",
 ]
 
 [[package]]
@@ -4832,7 +4823,6 @@ dependencies = [
  "toml_edit 0.19.10",
  "tracing",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -5357,7 +5347,6 @@ dependencies = [
  "serde",
  "serde_with",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -6193,7 +6182,6 @@ dependencies = [
  "anyhow",
  "serde",
  "serde_json",
- "workspace_hack",
 ]
 
 [[package]]
@@ -6794,7 +6782,6 @@ dependencies = [
  "tracing",
  "tracing-opentelemetry",
  "tracing-subscriber",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7012,7 +6999,6 @@ dependencies = [
  "url",
  "uuid",
  "walkdir",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7091,7 +7077,6 @@ dependencies = [
  "postgres_ffi",
  "regex",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7112,7 +7097,6 @@ dependencies = [
  "bindgen",
  "postgres_ffi",
  "utils",
- "workspace_hack",
 ]
 
 [[package]]
@@ -7669,8 +7653,6 @@ dependencies = [
  "tokio",
  "tokio-rustls 0.24.0",
  "tokio-util",
- "toml_datetime",
- "toml_edit 0.19.10",
  "tonic",
  "tower",
  "tracing",
diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml
index b377bd2cce..8aaa481f8c 100644
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -14,5 +14,3 @@ regex.workspace = true
 
 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
-
-workspace_hack.workspace = true
diff --git a/libs/consumption_metrics/Cargo.toml b/libs/consumption_metrics/Cargo.toml
index 3f290821c2..a40b74b952 100644
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -6,10 +6,8 @@ license = "Apache-2.0"
 
 [dependencies]
 anyhow.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml
index 6f442d8243..0c4be90267 100644
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -14,5 +14,3 @@ parking_lot.workspace = true
 hex.workspace = true
 scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
-
-workspace_hack.workspace = true
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index 0bd804051c..f87e7b8e3a 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -12,8 +12,6 @@ chrono.workspace = true
 twox-hash.workspace = true
 measured.workspace = true
 
-workspace_hack.workspace = true
-
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 measured-process.workspace = true
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 3bba89c76d..cb28359ac3 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -21,11 +21,9 @@ hex.workspace = true
 humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
-chrono.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 bincode.workspace = true
 rand.workspace = true
diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml
index c7611b9f21..f6854328fc 100644
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -18,7 +18,6 @@ tokio-rustls.workspace = true
 tracing.workspace = true
 
 pq_proto.workspace = true
-workspace_hack.workspace = true
 
 [dev-dependencies]
 once_cell.workspace = true
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index fbfea80ae2..19027d13ff 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -11,7 +11,5 @@ postgres.workspace = true
 tokio-postgres.workspace = true
 url.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 once_cell.workspace = true
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 86e72f6bdd..ee69878f69 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,8 +19,6 @@ thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 0edc642402..29dd01a936 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -14,8 +14,6 @@ postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
 
-workspace_hack.workspace = true
-
 [dev-dependencies]
 regex.workspace = true
 utils.workspace = true
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 8afabe670e..66bbe03ebc 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -11,9 +11,7 @@ itertools.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["io-util"] }
 tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 414bce1b26..02adee058f 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -32,7 +32,7 @@ scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
-workspace_hack.workspace = true
+
 azure_core.workspace = true
 azure_identity.workspace = true
 azure_storage.workspace = true
@@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] }
 camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
+tokio = { workspace = true, features = ["test-util"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 327d98ee77..e1f4bcca46 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,5 +9,3 @@ serde.workspace = true
 serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index 15e78932a8..8aa3c54f62 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -9,5 +9,3 @@ license.workspace = true
 anyhow.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
index 512a748124..5ea8db6b42 100644
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
-
-workspace_hack.workspace = true
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index ec05f849cf..6e593eeac1 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -39,7 +39,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
-toml_edit.workspace = true
+toml_edit = { workspace = true, features = ["serde"] }
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -54,7 +54,6 @@ walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
 metrics.workspace = true
-workspace_hack.workspace = true
 
 const_format.workspace = true
 
@@ -71,6 +70,7 @@ criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
 serde_assert.workspace = true
+tokio = { workspace = true, features = ["test-util"] }
 
 [[bench]]
 name = "benchmarks"
diff --git a/libs/walproposer/Cargo.toml b/libs/walproposer/Cargo.toml
index 73aa073c44..2d442dc429 100644
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -9,8 +9,6 @@ anyhow.workspace = true
 utils.workspace = true
 postgres_ffi.workspace = true
 
-workspace_hack.workspace = true
-
 [build-dependencies]
 anyhow.workspace = true
 bindgen.workspace = true
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 2d9b372654..20693ad63d 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -80,8 +80,6 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
-toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
-toml_edit = { version = "0.19", features = ["serde"] }
 tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
@@ -124,7 +122,6 @@ serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
-toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

From f4b3c317f394cb7f82c8c52754b290903957e85d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 19 Aug 2024 16:34:04 -0500
Subject: [PATCH 21/55] Add compute_logical_snapshot_files metric

Track the number of logical snapshot files on an endpoint over time.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vm-image-spec.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 41d6e11725..8c1c4512b4 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -312,6 +312,22 @@ files:
         query: |
           SELECT checkpoints_timed FROM pg_stat_bgwriter;
 
+      - metric_name: compute_logical_snapshot_files
+        type: gauge
+        help: 'Number of snapshot files in pg_logical/snapshot'
+        key_labels:
+          - tenant_id
+          - timeline_id
+        values: [num_logical_snapshot_files]
+        query: |
+          SELECT
+            (SELECT setting FROM pg_settings WHERE name = 'neon.tenant_id') AS tenant_id,
+            (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+            -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
+            -- temporary snapshot files are renamed to the actual snapshot files after they are
+            -- completely built. We only WAL-log the completely built snapshot files.
+            (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
+
       # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
       # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
 

From d919770c55b2a70fd0b19c888d3673b6fef2f889 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 21 Aug 2024 17:30:42 +0300
Subject: [PATCH 22/55] safekeeper: add listing timelines

Adds endpoint GET /v1/tenant/timeline listing all non-deleted timelines.
---
 safekeeper/src/http/routes.rs            | 13 +++++++++++++
 test_runner/fixtures/common_types.py     | 15 ++++++++++++++-
 test_runner/fixtures/safekeeper/http.py  |  8 +++++++-
 test_runner/regress/test_wal_acceptor.py |  4 ++++
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index c9defb0bcf..d11815f6ef 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -114,6 +114,16 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
     })
 }
 
+/// List all (not deleted) timelines.
+async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
+        .iter()
+        .map(|tli| tli.ttid)
+        .collect();
+    json_response(StatusCode::OK, res)
+}
+
 /// Report info about timeline.
 async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let ttid = TenantTimelineId::new(
@@ -562,6 +572,9 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .post("/v1/tenant/timeline", |r| {
             request_span(r, timeline_create_handler)
         })
+        .get("/v1/tenant/timeline", |r| {
+            request_span(r, timeline_list_handler)
+        })
         .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             request_span(r, timeline_status_handler)
         })
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index b63dfd4e47..7cadcbb4c2 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -1,7 +1,7 @@
 import random
 from dataclasses import dataclass
 from functools import total_ordering
-from typing import Any, Type, TypeVar, Union
+from typing import Any, Dict, Type, TypeVar, Union
 
 T = TypeVar("T", bound="Id")
 
@@ -147,6 +147,19 @@ class TimelineId(Id):
         return self.id.hex()
 
 
+@dataclass
+class TenantTimelineId:
+    tenant_id: TenantId
+    timeline_id: TimelineId
+
+    @classmethod
+    def from_json(cls, d: Dict[str, Any]) -> "TenantTimelineId":
+        return TenantTimelineId(
+            tenant_id=TenantId(d["tenant_id"]),
+            timeline_id=TimelineId(d["timeline_id"]),
+        )
+
+
 # Workaround for compat with python 3.9, which does not have `typing.Self`
 TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
 
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index a51b89744b..dd3a0a3d54 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import pytest
 import requests
 
-from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 
@@ -144,6 +144,12 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         assert isinstance(res_json, dict)
         return res_json
 
+    def timeline_list(self) -> List[TenantTimelineId]:
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/timeline")
+        res.raise_for_status()
+        resj = res.json()
+        return [TenantTimelineId.from_json(ttidj) for ttidj in resj]
+
     def timeline_create(
         self,
         tenant_id: TenantId,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 5d3b263936..bb3b16f3e1 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -254,6 +254,10 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
     assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn
     assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn
 
+    # Test timeline_list endpoint.
+    http_cli = env.safekeepers[0].http_client()
+    assert len(http_cli.timeline_list()) == 3
+
 
 # Check that dead minority doesn't prevent the commits: execute insert n_inserts
 # times, with fault_probability chance of getting a wal acceptor down or up

From b83d722369f1cb1d9a55ab8d39c36f30b0886ea4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 19:22:47 +0300
Subject: [PATCH 23/55] test: fix more flaky due to graceful shutdown (#8787)

Going through the list of recent flaky tests, trying to fix those
related to graceful shutdown.

- test_forward_compatibility: flush and wait for uploads to avoid
graceful shutdown
- test_layer_bloating: in the end the endpoint and vanilla are still up
=> immediate shutdown
- test_lagging_sk: pageserver shutdown is not related to the test =>
immediate shutdown
- test_lsn_lease_size: pageserver flushing is not needed => immediate
shutdown

Additionally:
- remove `wait_for_upload` usage from workload fixture

Cc: #8708
Fixes: #8710
---
 test_runner/fixtures/neon_fixtures.py      | 14 +++---------
 test_runner/fixtures/workload.py           |  7 +++---
 test_runner/regress/test_compatibility.py  | 12 ++++------
 test_runner/regress/test_import.py         |  5 ++---
 test_runner/regress/test_layer_bloating.py | 26 +++++++++++++---------
 test_runner/regress/test_tenant_size.py    |  3 +++
 test_runner/regress/test_wal_acceptor.py   |  2 ++
 7 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9aa275d343..2bb698f175 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -61,8 +61,6 @@ from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_lay
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
-    wait_for_upload,
-    wait_for_upload_queue_empty,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
@@ -5347,9 +5345,7 @@ def last_flush_lsn_upload(
     for tenant_shard_id, pageserver in shards:
         ps_http = pageserver.http_client(auth_token=auth_token)
         wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
-        # force a checkpoint to trigger upload
-        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id)
-        wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
+        ps_http.timeline_checkpoint(tenant_shard_id, timeline_id, wait_until_uploaded=True)
     return last_flush_lsn
 
 
@@ -5434,9 +5430,5 @@ def generate_uploads_and_deletions(
         # ensures that the pageserver is in a fully idle state: there will be no more
         # background ingest, no more uploads pending, and therefore no non-determinism
         # in subsequent actions like pageserver restarts.
-        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id)
-        # Finish uploads
-        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
-        # Finish all remote writes (including deletions)
-        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index cc93762175..065a78bf9b 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -10,7 +10,7 @@ from fixtures.neon_fixtures import (
     tenant_get_shards,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import wait_for_last_record_lsn
 
 # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
 # to ensure we don't do that: this enables running lots of Workloads in parallel safely.
@@ -174,8 +174,9 @@ class Workload:
 
                 if upload:
                     # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
-                    ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
-                    wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
+                    ps_http.timeline_checkpoint(
+                        tenant_shard_id, self.timeline_id, wait_until_uploaded=True
+                    )
                     log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
                 else:
                     log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 30ff40b7df..de27191945 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -9,14 +9,12 @@ from typing import List, Optional
 
 import pytest
 import toml
-from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
@@ -122,11 +120,9 @@ def test_create_snapshot(
     timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
 
     pageserver_http = env.pageserver.http_client()
-    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
 
-    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+    flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
 
     env.endpoints.stop_all()
     for sk in env.safekeepers:
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 4dae9176b8..4385cfca76 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -18,7 +18,6 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture
@@ -144,7 +143,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
 
     # Wait for data to land in s3
     wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn))
-    wait_for_upload(client, tenant, timeline, Lsn(end_lsn))
+    client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True)
 
     # Check it worked
     endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant)
@@ -290,7 +289,7 @@ def _import(
 
     # Wait for data to land in s3
     wait_for_last_record_lsn(client, tenant, timeline, lsn)
-    wait_for_upload(client, tenant, timeline, lsn)
+    client.timeline_checkpoint(tenant, timeline, compact=False, wait_until_uploaded=True)
 
     # Check it worked
     endpoint = env.endpoints.create_start(branch_name, tenant_id=tenant, lsn=lsn)
diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py
index 77dc8a35b5..b8126395fd 100644
--- a/test_runner/regress/test_layer_bloating.py
+++ b/test_runner/regress/test_layer_bloating.py
@@ -1,27 +1,31 @@
 import os
-import time
 
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
-    NeonEnv,
+    NeonEnvBuilder,
     logical_replication_sync,
     wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion
 
 
-def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
-    env = neon_simple_env
-
-    if env.pg_version != PgVersion.V16:
+def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg):
+    if neon_env_builder.pg_version != PgVersion.V16:
         pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
 
-    timeline = env.neon_cli.create_branch("test_logical_replication", "empty")
-    endpoint = env.endpoints.create_start(
-        "test_logical_replication", config_lines=["log_statement=all"]
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "compaction_threshold": 99999,
+            "image_creation_threshold": 99999,
+        }
     )
 
+    timeline = env.initial_timeline
+    endpoint = env.endpoints.create_start("main", config_lines=["log_statement=all"])
+
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
 
@@ -54,7 +58,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
     # Wait logical replication to sync
     logical_replication_sync(vanilla_pg, endpoint)
     wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline)
-    time.sleep(10)
+    env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False)
 
     # Check layer file sizes
     timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/"
@@ -63,3 +67,5 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg):
         if filename.startswith("00000"):
             log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")
             assert os.path.getsize(timeline_path + filename) < 512_000_000
+
+    env.stop(immediate=True)
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index b1ade77a14..f872116a1c 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -757,6 +757,9 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
 
     assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res)
 
+    # we are writing a lot, and flushing all of that to disk is not important for this test
+    env.stop(immediate=True)
+
 
 def insert_with_action(
     env: NeonEnv,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index bb3b16f3e1..19df834b81 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1300,6 +1300,8 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
     # Check that WALs are the same.
     cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id)
 
+    env.stop(immediate=True)
+
 
 # Smaller version of test_one_sk_down testing peer recovery in isolation: that
 # it works without compute at all.

From 99c19cad24b5bb5974403a1e2541fe28ac4c0d53 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 16 Aug 2024 12:44:12 -0500
Subject: [PATCH 24/55] Add compute_receive_lsn metric

Useful for dashboarding the replication metrics of a single endpoint.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vm-image-spec.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 8c1c4512b4..d1cfbda15d 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -272,6 +272,19 @@ files:
               else (pg_current_wal_lsn() - '0/0')::FLOAT8
             end as lsn;
 
+      - metric_name: compute_receive_lsn
+        type: gauge
+        help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
+        key_labels:
+        values: [lsn]
+        query: |
+          SELECT
+            CASE
+              WHEN pg_catalog.pg_is_in_recovery()
+              THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
+              ELSE 0
+            END AS lsn;
+
       - metric_name: replication_delay_bytes
         type: gauge
         help: 'Bytes between received and replayed LSN'

From 04752dfa757472062cb70f0fa1fa2e5ccff89225 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 21 Aug 2024 11:15:18 -0500
Subject: [PATCH 25/55] Prefix current_lsn with compute_

---
 vm-image-spec.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index d1cfbda15d..622004b931 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -259,7 +259,7 @@ files:
           from
             (values ('5m'),('15m'),('1h')) as t (x);
 
-      - metric_name: current_lsn
+      - metric_name: compute_current_lsn
         type: gauge
         help: 'Current LSN of the database'
         key_labels:

From 07b7c63975fbfaf60f28176b275c4d57e28a8e04 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 21:26:27 +0300
Subject: [PATCH 26/55] test: avoid some too long shutdowns by flushing before
 shutdown (#8772)

After #8655, we needed to mark some tests to shut down immediately. To
aid these tests, try the new pattern of `flush_ep_to_pageserver`
followed by a non-compacting checkpoint. This moves the general graceful
shutdown problem of having too much to flush at shutdown into the test.
Also, add logging for how long the graceful shutdown took, if we got to
complete it, for faster log eyeballing.

Fixes: #8712
Cc: #8715, #8708
---
 pageserver/src/lib.rs                         |  7 +++++-
 .../pagebench/test_ondemand_download_churn.py | 17 +++++++-------
 test_runner/performance/test_layer_map.py     | 23 +++++++++++--------
 test_runner/regress/test_combocid.py          | 20 +++++++++++++---
 4 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5829a1c188..dbfc9f3544 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
 ) {
     use std::time::Duration;
 
+    let started_at = std::time::Instant::now();
+
     // If the orderly shutdown below takes too long, we still want to make
     // sure that all walredo processes are killed and wait()ed on by us, not systemd.
     //
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
     walredo_extraordinary_shutdown_thread.join().unwrap();
     info!("walredo_extraordinary_shutdown_thread done");
 
-    info!("Shut down successfully completed");
+    info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        "Shut down successfully completed"
+    );
     std::process::exit(exit_code);
 }
 
diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
index 0348b08f04..9ad6e7907c 100644
--- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
+++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
@@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    flush_ep_to_pageserver,
+)
 from fixtures.remote_storage import s3_storage
 from fixtures.utils import humantime_to_ms
 
@@ -62,9 +66,6 @@ def test_download_churn(
 
     run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
 
-    # see https://github.com/neondatabase/neon/issues/8712
-    env.stop(immediate=True)
-
 
 def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     remote_storage_kind = s3_storage()
@@ -98,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)})  as i",
             options="-c statement_timeout=0",
         )
-        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
-    # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
-    wait_for_upload_queue_empty(client, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
+
+    client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True)
 
     return env
 
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 890b70b9fc..bc6d9de346 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -1,20 +1,21 @@
 import time
 
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 
-#
-# Benchmark searching the layer map, when there are a lot of small layer files.
-#
 def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
-    env = neon_env_builder.init_start()
+    """Benchmark searching the layer map, when there are a lot of small layer files."""
+
+    env = neon_env_builder.init_configs()
     n_iters = 10
     n_records = 100000
 
+    env.start()
+
     # We want to have a lot of lot of layer files to exercise the layer map. Disable
     # GC, and make checkpoint_distance very small, so that we get a lot of small layer
     # files.
-    tenant, _ = env.neon_cli.create_tenant(
+    tenant, timeline = env.neon_cli.create_tenant(
         conf={
             "gc_period": "0s",
             "checkpoint_distance": "16384",
@@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
         }
     )
 
-    env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
-    endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
     cur = endpoint.connect().cursor()
     cur.execute("create table t(x integer)")
     for _ in range(n_iters):
@@ -33,9 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
         time.sleep(1)
 
     cur.execute("vacuum t")
+
     with zenbenchmark.record_duration("test_query"):
         cur.execute("SELECT count(*) from t")
         assert cur.fetchone() == (n_iters * n_records,)
 
-    # see https://github.com/neondatabase/neon/issues/8712
-    env.stop(immediate=True)
+    flush_ep_to_pageserver(env, endpoint, tenant, timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        tenant, timeline, compact=False, wait_until_uploaded=True
+    )
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py
index 6d2567b7ee..41907b1f20 100644
--- a/test_runner/regress/test_combocid.py
+++ b/test_runner/regress/test_combocid.py
@@ -1,4 +1,4 @@
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 
 def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
@@ -34,7 +34,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
 
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -43,6 +43,10 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
     assert len(rows) == 500
 
     cur.execute("rollback")
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )
 
 
 def test_combocid_delete(neon_env_builder: NeonEnvBuilder):
@@ -92,7 +96,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
     cur.execute("delete from t")
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -102,6 +106,11 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
 
     cur.execute("rollback")
 
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )
+
 
 def test_combocid(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
@@ -137,3 +146,8 @@ def test_combocid(neon_env_builder: NeonEnvBuilder):
     assert cur.rowcount == n_records
 
     cur.execute("rollback")
+
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )

From a968554a8c36c2accf17c5a1f2f23c2bc2f2ec47 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 21 Aug 2024 16:25:21 -0400
Subject: [PATCH 27/55] fix(pageserver): unify initdb optimization for sparse
 keyspaces; fix force img generation (#8776)

close https://github.com/neondatabase/neon/issues/8558

* Directly generate image layers for sparse keyspaces during initdb
optimization.
* Support force image layer generation for sparse keyspaces.
* Fix a bug of incorrect image layer key range in case of duplicated
keys. (The added line: `start = img_range.end;`) This can cause
overlapping image layers and keys to disappear.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline.rs | 64 ++++++++++++-------------------
 1 file changed, 24 insertions(+), 40 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 9732cf8b50..80e3843021 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3589,34 +3589,6 @@ impl Timeline {
                 return Err(FlushLayerError::Cancelled);
             }
 
-            // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
-            // This code path will not be hit during regression tests. After #7099 we have a single partition
-            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
-            // to be fixed.
-
-            // For metadata, always create delta layers.
-            let delta_layer = if !metadata_partition.parts.is_empty() {
-                assert_eq!(
-                    metadata_partition.parts.len(),
-                    1,
-                    "currently sparse keyspace should only contain a single metadata keyspace"
-                );
-                let metadata_keyspace = &metadata_partition.parts[0];
-                self.create_delta_layer(
-                    &frozen_layer,
-                    Some(
-                        metadata_keyspace.0.ranges.first().unwrap().start
-                            ..metadata_keyspace.0.ranges.last().unwrap().end,
-                    ),
-                    ctx,
-                )
-                .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
-            } else {
-                None
-            };
-
-            // For image layers, we add them immediately into the layer map.
             let mut layers_to_upload = Vec::new();
             layers_to_upload.extend(
                 self.create_image_layers(
@@ -3627,13 +3599,27 @@ impl Timeline {
                 )
                 .await?,
             );
-
-            if let Some(delta_layer) = delta_layer {
-                layers_to_upload.push(delta_layer.clone());
-                (layers_to_upload, Some(delta_layer))
-            } else {
-                (layers_to_upload, None)
+            if !metadata_partition.parts.is_empty() {
+                assert_eq!(
+                    metadata_partition.parts.len(),
+                    1,
+                    "currently sparse keyspace should only contain a single metadata keyspace"
+                );
+                layers_to_upload.extend(
+                    self.create_image_layers(
+                        // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
+                        // every single key within the keyspace, and therefore, it's safe to force converting it
+                        // into a dense keyspace before calling this function.
+                        &metadata_partition.into_dense(),
+                        self.initdb_lsn,
+                        ImageLayerCreationMode::Initial,
+                        ctx,
+                    )
+                    .await?,
+                );
             }
+
+            (layers_to_upload, None)
         } else {
             // Normal case, write out a L0 delta layer file.
             // `create_delta_layer` will not modify the layer map.
@@ -4043,8 +4029,6 @@ impl Timeline {
         mode: ImageLayerCreationMode,
         start: Key,
     ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
-        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
-
         // Metadata keys image layer creation.
         let mut reconstruct_state = ValuesReconstructState::default();
         let data = self
@@ -4210,15 +4194,13 @@ impl Timeline {
                         "metadata keys must be partitioned separately"
                     );
                 }
-                if mode == ImageLayerCreationMode::Initial {
-                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                }
                 if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
                     // Skip compaction if there are not enough updates. Metadata compaction will do a scan and
                     // might mess up with evictions.
                     start = img_range.end;
                     continue;
                 }
+                // For initial and force modes, we always generate image layers for metadata keys.
             } else if let ImageLayerCreationMode::Try = mode {
                 // check_for_image_layers = false -> skip
                 // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4226,7 +4208,8 @@ impl Timeline {
                     start = img_range.end;
                     continue;
                 }
-            } else if let ImageLayerCreationMode::Force = mode {
+            }
+            if let ImageLayerCreationMode::Force = mode {
                 // When forced to create image layers, we might try and create them where they already
                 // exist.  This mode is only used in tests/debug.
                 let layers = self.layers.read().await;
@@ -4240,6 +4223,7 @@ impl Timeline {
                         img_range.start,
                         img_range.end
                     );
+                    start = img_range.end;
                     continue;
                 }
             }

From 7c74112b2a6e23c07bfd9cc62c240cd6bbdd3bd9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 22 Aug 2024 11:04:42 +0100
Subject: [PATCH 28/55] pageserver: batch InMemoryLayer `put`s, remove need to
 sort items by LSN during ingest (#8591)

## Problem/Solution

TimelineWriter::put_batch is simply a loop over individual puts. Each
put acquires and releases locks, and checks for potentially starting a
new layer. Batching these is more efficient, but more importantly
unlocks future changes where we can pre-build serialized buffers much
earlier in the ingest process, potentially even on the safekeeper
(imagine a future model where some variant of DatadirModification lives
on the safekeeper).

Ensuring that the values in put_batch are written to one layer also
enables a simplification upstream, where we no longer need to write
values in LSN-order. This saves us a sort, but also simplifies follow-on
refactors to DatadirModification: we can store metadata keys and data
keys separately at that level without needing to zip them together in
LSN order later.

## Why?

In this PR, these changes are simply optimizations, but they are
motivated by evolving the ingest path in the direction of disentangling
extracting DatadirModification from Timeline. It may not be obvious how
right now, but the general idea is that we'll end up with three phases
of ingest:
- A) Decode walrecords and build a datadirmodification with all the
simple data contents already in a big serialized buffer ready to write
to an ephemeral layer **<-- this part can be pipelined and parallelized,
and done on a safekeeper!**
- B) Let that datadirmodification see a Timeline, so that it can also
generate all the metadata updates that require a read-modify-write of
existing pages
- C) Dump the results of B into an ephemeral layer.

Related: https://github.com/neondatabase/neon/issues/8452

## Caveats

Doing a big monolithic buffer of values to write to disk is ordinarily
an anti-pattern: we prefer nice streaming I/O. However:
- In future, when we do this first decode stage on the safekeeper, it
would be inefficient to serialize a Vec of Value, and then later
deserialize it just to add blob size headers while writing into the
ephemeral layer format. The idea is that for bulk write data, we will
serialize exactly once.
- The monolithic buffer is a stepping stone to pipelining more of this:
by serializing earlier (rather than at the final put_value), we will be
able to parallelize the wal decoding and bulk serialization of data page
writes.
- The ephemeral layer's buffered writer already stalls writes while it
waits to flush: so while yes we'll stall for a couple milliseconds to
write a couple megabytes, we already have stalls like this, just
distributed across smaller writes.

## Benchmarks

This PR is primarily a stepping stone to safekeeper ingest filtering,
but also provides a modest efficiency improvement to the `wal_recovery`
part of `test_bulk_ingest`.

test_bulk_ingest:

```
test_bulk_insert[neon-release-pg16].insert: 23.659 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 626 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 18.981 s
test_bulk_insert[neon-release-pg16].compaction: 0.055 s

vs. tip of main:
test_bulk_insert[neon-release-pg16].insert: 24.001 s
test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB
test_bulk_insert[neon-release-pg16].peak_mem: 604 MB
test_bulk_insert[neon-release-pg16].size: 0 MB
test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB
test_bulk_insert[neon-release-pg16].num_files_uploaded: 8
test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB
test_bulk_insert[neon-release-pg16].wal_recovery: 23.586 s
test_bulk_insert[neon-release-pg16].compaction: 0.054 s
```
---
 pageserver/benches/bench_ingest.rs            |  19 ++-
 pageserver/src/pgdatadir_mapping.rs           |  70 +++++++---
 pageserver/src/tenant/ephemeral_file.rs       |  35 +++--
 pageserver/src/tenant/storage_layer.rs        |   2 +-
 .../tenant/storage_layer/inmemory_layer.rs    | 131 +++++++++++++-----
 pageserver/src/tenant/timeline.rs             | 101 +++++++-------
 .../walreceiver/walreceiver_connection.rs     |   9 +-
 7 files changed, 247 insertions(+), 120 deletions(-)

diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 0336302de0..bd99f5289d 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -10,6 +10,7 @@ use pageserver::{
     page_cache,
     repository::Value,
     task_mgr::TaskKind,
+    tenant::storage_layer::inmemory_layer::SerializedBatch,
     tenant::storage_layer::InMemoryLayer,
     virtual_file,
 };
@@ -67,12 +68,16 @@ async fn ingest(
     let layer =
         InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
 
-    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
+    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
+    let data_ser_size = data.serialized_size().unwrap() as usize;
     let ctx = RequestContext::new(
         pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
         pageserver::context::DownloadBehavior::Download,
     );
 
+    const BATCH_SIZE: usize = 16;
+    let mut batch = Vec::new();
+
     for i in 0..put_count {
         lsn += put_size as u64;
 
@@ -95,7 +100,17 @@ async fn ingest(
             }
         }
 
-        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
+        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
+        if batch.len() >= BATCH_SIZE {
+            let this_batch = std::mem::take(&mut batch);
+            let serialized = SerializedBatch::from_values(this_batch);
+            layer.put_batch(serialized, &ctx).await?;
+        }
+    }
+    if !batch.is_empty() {
+        let this_batch = std::mem::take(&mut batch);
+        let serialized = SerializedBatch::from_values(this_batch);
+        layer.put_batch(serialized, &ctx).await?;
     }
     layer.freeze(lsn + 1).await;
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 4f7eb1a00c..d6e0b82e1d 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,12 +15,11 @@ use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
     dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
     relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
-use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -174,6 +172,7 @@ impl Timeline {
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
             pending_directory_entries: Vec::new(),
+            pending_bytes: 0,
             lsn,
         }
     }
@@ -1022,21 +1021,33 @@ pub struct DatadirModification<'a> {
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
     pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
+    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
     pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 
     /// For special "directory" keys that store key-value maps, track the size of the map
     /// if it was updated in this modification.
     pending_directory_entries: Vec<(DirectoryKind, usize)>,
+
+    /// An **approximation** of how large our EphemeralFile write will be when committed.
+    pending_bytes: usize,
 }
 
 impl<'a> DatadirModification<'a> {
+    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
+    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
+    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
+    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
+
     /// Get the current lsn
     pub(crate) fn get_lsn(&self) -> Lsn {
         self.lsn
     }
 
+    pub(crate) fn approx_pending_bytes(&self) -> usize {
+        self.pending_bytes
+    }
+
     /// Set the current lsn
     pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
         ensure!(
@@ -1769,21 +1780,25 @@ impl<'a> DatadirModification<'a> {
         // Flush relation and  SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
         for (key, values) in self.pending_updates.drain() {
-            for (lsn, value) in values {
+            let mut write_batch = Vec::new();
+            for (lsn, value_ser_size, value) in values {
                 if key.is_rel_block_key() || key.is_slru_block_key() {
                     // This bails out on first error without modifying pending_updates.
                     // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
+                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
                 } else {
-                    retained_pending_updates
-                        .entry(key)
-                        .or_default()
-                        .push((lsn, value));
+                    retained_pending_updates.entry(key).or_default().push((
+                        lsn,
+                        value_ser_size,
+                        value,
+                    ));
                 }
             }
+            writer.put_batch(write_batch, ctx).await?;
         }
 
         self.pending_updates = retained_pending_updates;
+        self.pending_bytes = 0;
 
         if pending_nblocks != 0 {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1809,17 +1824,20 @@ impl<'a> DatadirModification<'a> {
         self.pending_nblocks = 0;
 
         if !self.pending_updates.is_empty() {
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
-                self.pending_updates
-                    .drain()
-                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
-                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
-                VecMapOrdering::GreaterOrEqual,
-            );
+            // Ordering: the items in this batch do not need to be in any global order, but values for
+            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+            // this to do efficient updates to its index.
+            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
+                .pending_updates
+                .drain()
+                .flat_map(|(key, values)| {
+                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
+                        (key.to_compact(), lsn, val_ser_size, value)
+                    })
+                })
+                .collect::<Vec<_>>();
 
-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(batch, ctx).await?;
         }
 
         if !self.pending_deletions.is_empty() {
@@ -1844,6 +1862,8 @@ impl<'a> DatadirModification<'a> {
             writer.update_directory_entries_count(kind, count as u64);
         }
 
+        self.pending_bytes = 0;
+
         Ok(())
     }
 
@@ -1860,7 +1880,7 @@ impl<'a> DatadirModification<'a> {
         // Note: we don't check pending_deletions. It is an error to request a
         // value that has been removed, deletion only avoids leaking storage.
         if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, value)) = values.last() {
+            if let Some((_, _, value)) = values.last() {
                 return if let Value::Image(img) = value {
                     Ok(img.clone())
                 } else {
@@ -1888,13 +1908,17 @@ impl<'a> DatadirModification<'a> {
     fn put(&mut self, key: Key, val: Value) {
         let values = self.pending_updates.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
+        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
             if *last_lsn == self.lsn {
+                *last_value_ser_size = val.serialized_size().unwrap() as usize;
                 *last_value = val;
                 return;
             }
         }
-        values.push((self.lsn, val));
+
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+        self.pending_bytes += val_serialized_size;
+        values.push((self.lsn, val_serialized_size, val));
     }
 
     fn delete(&mut self, key_range: Range<Key>) {
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 3eb8384d05..44f0fc7ab1 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -79,6 +79,8 @@ impl EphemeralFile {
         self.rw.read_blk(blknum, ctx).await
     }
 
+    #[cfg(test)]
+    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
     pub(crate) async fn write_blob(
         &mut self,
         srcbuf: &[u8],
@@ -86,17 +88,30 @@ impl EphemeralFile {
     ) -> Result<u64, io::Error> {
         let pos = self.rw.bytes_written();
 
-        // Write the length field
-        if srcbuf.len() < 0x80 {
-            // short one-byte length header
-            let len_buf = [srcbuf.len() as u8];
+        let mut len_bytes = std::io::Cursor::new(Vec::new());
+        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
+            srcbuf.len(),
+            &mut len_bytes,
+        );
+        let len_bytes = len_bytes.into_inner();
 
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        } else {
-            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
-            len_buf[0] |= 0x80;
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        }
+        // Write the length field
+        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
+
+        // Write the payload
+        self.rw.write_all_borrowed(srcbuf, ctx).await?;
+
+        Ok(pos)
+    }
+
+    /// Returns the offset at which the first byte of the input was written, for use
+    /// in constructing indices over the written value.
+    pub(crate) async fn write_raw(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<u64, io::Error> {
+        let pos = self.rw.bytes_written();
 
         // Write the payload
         self.rw.write_all_borrowed(srcbuf, ctx).await?;
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 04f89db401..133b34b8b5 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -2,7 +2,7 @@
 
 pub mod delta_layer;
 pub mod image_layer;
-pub(crate) mod inmemory_layer;
+pub mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 130d1002a0..a71b4dd83b 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -33,7 +33,7 @@ use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::RwLock;
 
 use super::{
     DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
@@ -320,6 +320,82 @@ impl InMemoryLayer {
     }
 }
 
+/// Offset of a particular Value within a serialized batch.
+struct SerializedBatchOffset {
+    key: CompactKey,
+    lsn: Lsn,
+    /// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
+    offset: u64,
+}
+
+pub struct SerializedBatch {
+    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
+    pub(crate) raw: Vec<u8>,
+
+    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
+    offsets: Vec<SerializedBatchOffset>,
+
+    /// The highest LSN of any value in the batch
+    pub(crate) max_lsn: Lsn,
+}
+
+impl SerializedBatch {
+    /// Write a blob length in the internal format of the EphemeralFile
+    pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
+        use std::io::Write;
+
+        if len < 0x80 {
+            // short one-byte length header
+            let len_buf = [len as u8];
+
+            cursor
+                .write_all(&len_buf)
+                .expect("Writing to Vec is infallible");
+        } else {
+            let mut len_buf = u32::to_be_bytes(len as u32);
+            len_buf[0] |= 0x80;
+            cursor
+                .write_all(&len_buf)
+                .expect("Writing to Vec is infallible");
+        }
+    }
+
+    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
+        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
+        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
+        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
+        let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
+
+        let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
+        let mut max_lsn: Lsn = Lsn(0);
+        for (key, lsn, val_ser_size, val) in batch {
+            let relative_off = cursor.position();
+
+            Self::write_blob_length(val_ser_size, &mut cursor);
+            val.ser_into(&mut cursor)
+                .expect("Writing into in-memory buffer is infallible");
+
+            offsets.push(SerializedBatchOffset {
+                key,
+                lsn,
+                offset: relative_off,
+            });
+            max_lsn = std::cmp::max(max_lsn, lsn);
+        }
+
+        let buffer = cursor.into_inner();
+
+        // Assert that we didn't do any extra allocations while building buffer.
+        debug_assert!(buffer.len() <= buffer_size);
+
+        Self {
+            raw: buffer,
+            offsets,
+            max_lsn,
+        }
+    }
+}
+
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
     write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -380,37 +456,20 @@ impl InMemoryLayer {
         })
     }
 
-    // Write operations
-
-    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
-    /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+    // Write path.
+    pub async fn put_batch(
         &self,
-        key: CompactKey,
-        lsn: Lsn,
-        buf: &[u8],
+        serialized_batch: SerializedBatch,
         ctx: &RequestContext,
     ) -> Result<()> {
         let mut inner = self.inner.write().await;
         self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
-    }
 
-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: CompactKey,
-        lsn: Lsn,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-
-        let off = {
-            locked_inner
+        let base_off = {
+            inner
                 .file
-                .write_blob(
-                    buf,
+                .write_raw(
+                    &serialized_batch.raw,
                     &RequestContextBuilder::extend(ctx)
                         .page_content_kind(PageContentKind::InMemoryLayer)
                         .build(),
@@ -418,15 +477,23 @@ impl InMemoryLayer {
                 .await?
         };
 
-        let vec_map = locked_inner.index.entry(key).or_default();
-        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
-        if old.is_some() {
-            // We already had an entry for this LSN. That's odd..
-            warn!("Key {} at {} already exists", key, lsn);
+        for SerializedBatchOffset {
+            key,
+            lsn,
+            offset: relative_off,
+        } in serialized_batch.offsets
+        {
+            let off = base_off + relative_off;
+            let vec_map = inner.index.entry(key).or_default();
+            let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+            if old.is_some() {
+                // We already had an entry for this LSN. That's odd..
+                warn!("Key {} at {} already exists", key, lsn);
+            }
         }
 
-        let size = locked_inner.file.len();
-        locked_inner.resource_units.maybe_publish_size(size);
+        let size = inner.file.len();
+        inner.resource_units.maybe_publish_size(size);
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 80e3843021..e90f65942f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::{
-        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
     },
     keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
@@ -44,10 +44,8 @@ use tokio::{
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
-    bin_ser::BeSer,
     fs_ext, pausable_failpoint,
     sync::gate::{Gate, GateGuard},
-    vec_map::VecMap,
 };
 
 use std::pin::pin;
@@ -137,7 +135,10 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
-use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
+use super::{
+    config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
+    upload_queue::NotInitialized,
+};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
@@ -5574,44 +5575,6 @@ enum OpenLayerAction {
 }
 
 impl<'a> TimelineWriter<'a> {
-    /// Put a new page version that can be constructed from a WAL record
-    ///
-    /// This will implicitly extend the relation, if the page is beyond the
-    /// current end-of-file.
-    pub(crate) async fn put(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        value: &Value,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Avoid doing allocations for "small" values.
-        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-        value.ser_into(&mut buf)?;
-        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-
-        let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
-        let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
-        }
-
-        res
-    }
-
     async fn handle_open_layer_action(
         &mut self,
         at: Lsn,
@@ -5717,18 +5680,58 @@ impl<'a> TimelineWriter<'a> {
     }
 
     /// Put a batch of keys at the specified Lsns.
-    ///
-    /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
     pub(crate) async fn put_batch(
         &mut self,
-        batch: VecMap<Lsn, (Key, Value)>,
+        batch: Vec<(CompactKey, Lsn, usize, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        for (lsn, (key, val)) in batch {
-            self.put(key, lsn, &val, ctx).await?
+        if batch.is_empty() {
+            return Ok(());
         }
 
-        Ok(())
+        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
+        let batch_max_lsn = serialized_batch.max_lsn;
+        let buf_size: u64 = serialized_batch.raw.len() as u64;
+
+        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
+        let layer = self
+            .handle_open_layer_action(batch_max_lsn, action, ctx)
+            .await?;
+
+        let res = layer.put_batch(serialized_batch, ctx).await;
+
+        if res.is_ok() {
+            // Update the current size only when the entire write was ok.
+            // In case of failures, we may have had partial writes which
+            // render the size tracking out of sync. That's ok because
+            // the checkpoint distance should be significantly smaller
+            // than the S3 single shot upload limit of 5GiB.
+            let state = self.write_guard.as_mut().unwrap();
+
+            state.current_size += buf_size;
+            state.prev_lsn = Some(batch_max_lsn);
+            state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
+        }
+
+        res
+    }
+
+    #[cfg(test)]
+    /// Test helper, for tests that would like to poke individual values without composing a batch
+    pub(crate) async fn put(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use utils::bin_ser::BeSer;
+        let val_ser_size = value.serialized_size().unwrap() as usize;
+        self.put_batch(
+            vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
+            ctx,
+        )
+        .await
     }
 
     pub(crate) async fn delete_batch(
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index b5c577af72..0114473eda 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
     metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    task_mgr::TaskKind,
-    task_mgr::WALRECEIVER_RUNTIME,
+    pgdatadir_mapping::DatadirModification,
+    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
     walingest::WalIngest,
     walrecord::DecodedWALRecord,
@@ -345,7 +345,10 @@ pub(super) async fn handle_walreceiver_connection(
                         // Commit every ingest_batch_size records. Even if we filtered out
                         // all records, we still need to call commit to advance the LSN.
                         uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size {
+                        if uncommitted_records >= ingest_batch_size
+                            || modification.approx_pending_bytes()
+                                > DatadirModification::MAX_PENDING_BYTES
+                        {
                             WAL_INGEST
                                 .records_committed
                                 .inc_by(uncommitted_records - filtered_records);

From d645645fab662df28ffb41dde18ca1963c237532 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:45:29 +0200
Subject: [PATCH 29/55] Sleep in test_scrubber_physical_gc (#8798)

This copies a piece of code from `test_scrubber_physical_gc_ancestors`
to fix a source of flakiness: later on we rely on stuff being older than
a second, but the test can run faster under optimal conditions (as
happened to me locally, but also observable in
[this](https://neon-github-public-dev.s3.amazonaws.com/reports/main/10470762360/index.html#testresult/f713b02657db4b4c/retries)
allure report):

```
test_runner/regress/test_storage_scrubber.py:169: in test_scrubber_physical_gc
    assert gc_summary["remote_storage_errors"] == 0
E   assert 1 == 0
```
---
 test_runner/regress/test_storage_scrubber.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 2844d1b1d2..292a9a1010 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -152,6 +152,9 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
         # This write includes remote upload, will generate an index in this generation
         workload.write_rows(1)
 
+    # We will use a min_age_secs=1 threshold for deletion, let it pass
+    time.sleep(2)
+
     # With a high min_age, the scrubber should decline to delete anything
     gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600)
     assert gc_summary["remote_storage_errors"] == 0

From 0e6c0d47a5d29e151d1a8013e627998df8772f6f Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Thu, 22 Aug 2024 12:52:36 +0200
Subject: [PATCH 30/55] Revert "Use sycnhronous commit for logical replicaiton
 worker (#8645)" (#8792)

This reverts commit cbe8c77997aea576a96a7f8d31147cb7a11d6a6b.

This change was originally made to test a hypothesis, but after that,
the proper fix #8669 was merged, so now it's not needed. Moreover, the
test is still flaky, so this bug was probably not the cause of the
flakiness.

Related to #8097
---
 test_runner/regress/test_subscriber_restart.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
index 4581008022..91caad7220 100644
--- a/test_runner/regress/test_subscriber_restart.py
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -37,9 +37,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
             scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
             # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica
             pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin"
-            # synchronous_commit=on to test a hypothesis for why this test has been flaky.
-            # XXX: Add link to the issue
-            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)"
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
             scur.execute(query)
             time.sleep(2)  # let initial table sync complete
 

From 1a9d559be8a77e7d8375c10238e4e4c0e76a40f7 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Thu, 22 Aug 2024 13:29:05 +0200
Subject: [PATCH 31/55] proxy: Enable stricter/pedantic clippy checks (#8775)

Create a list of currently allowed exceptions that should be reduced
over time.
---
 proxy/src/lib.rs | 90 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index ea92eaaa55..b7d497ebcc 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,4 +1,92 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
+// rustc lints/lint groups
+// https://doc.rust-lang.org/rustc/lints/groups.html
+#![deny(
+    deprecated,
+    future_incompatible,
+    // TODO: consider let_underscore
+    nonstandard_style,
+    rust_2024_compatibility
+)]
+#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
+// List of denied lints from the clippy::restriction group.
+// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
+#![warn(
+    clippy::undocumented_unsafe_blocks,
+    clippy::dbg_macro,
+    clippy::empty_enum_variants_with_brackets,
+    clippy::exit,
+    clippy::float_cmp_const,
+    clippy::lossy_float_literal,
+    clippy::macro_use_imports,
+    clippy::manual_ok_or,
+    // TODO: consider clippy::map_err_ignore
+    // TODO: consider clippy::mem_forget
+    clippy::rc_mutex,
+    clippy::rest_pat_in_fully_bound_structs,
+    clippy::string_add,
+    clippy::string_to_string,
+    clippy::todo,
+    // TODO: consider clippy::unimplemented
+    // TODO: consider clippy::unwrap_used
+)]
+// List of permanently allowed lints.
+#![allow(
+    // It's ok to cast u8 to bool, etc.
+    clippy::cast_lossless,
+)]
+// List of temporarily allowed lints.
+// TODO: Switch to expect() once stable with 1.81.
+// TODO: fix code and reduce list or move to permanent list above.
+#![allow(
+    clippy::cargo_common_metadata,
+    clippy::cast_possible_truncation,
+    clippy::cast_possible_wrap,
+    clippy::cast_precision_loss,
+    clippy::cast_sign_loss,
+    clippy::default_trait_access,
+    clippy::doc_markdown,
+    clippy::explicit_iter_loop,
+    clippy::float_cmp,
+    clippy::if_not_else,
+    clippy::ignored_unit_patterns,
+    clippy::implicit_hasher,
+    clippy::inconsistent_struct_constructor,
+    clippy::inline_always,
+    clippy::items_after_statements,
+    clippy::manual_assert,
+    clippy::manual_let_else,
+    clippy::manual_string_new,
+    clippy::match_bool,
+    clippy::match_same_arms,
+    clippy::match_wild_err_arm,
+    clippy::missing_errors_doc,
+    clippy::missing_panics_doc,
+    clippy::module_name_repetitions,
+    clippy::multiple_crate_versions,
+    clippy::must_use_candidate,
+    clippy::needless_for_each,
+    clippy::needless_pass_by_value,
+    clippy::needless_raw_string_hashes,
+    clippy::option_as_ref_cloned,
+    clippy::redundant_closure_for_method_calls,
+    clippy::redundant_else,
+    clippy::return_self_not_must_use,
+    clippy::similar_names,
+    clippy::single_char_pattern,
+    clippy::single_match_else,
+    clippy::struct_excessive_bools,
+    clippy::struct_field_names,
+    clippy::too_many_lines,
+    clippy::uninlined_format_args,
+    clippy::unnested_or_patterns,
+    clippy::unreadable_literal,
+    clippy::unused_async,
+    clippy::unused_self,
+    clippy::used_underscore_binding,
+    clippy::wildcard_imports
+)]
+// List of temporarily allowed lints to unblock beta/nightly.
+#![allow(unknown_lints, clippy::manual_inspect)]
 
 use std::convert::Infallible;
 

From b1c457898b7af111cd59d3a8c2d3bde5bae5085e Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 22 Aug 2024 18:38:03 +0300
Subject: [PATCH 32/55] test_compatibility: flush in the end (#8804)

`test_forward_compatibility` is still often failing at graceful
shutdown. Fix this by explicit flush before shutdown.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10506613738/index.html#testresult/5e7111907f7ecfb2/

Cc: #8655 and #8708
Previous attempt: #8787
---
 test_runner/regress/test_compatibility.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index de27191945..c361efe90a 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -11,7 +11,12 @@ import pytest
 import toml
 from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, flush_ep_to_pageserver
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    flush_ep_to_pageserver,
+)
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
@@ -296,7 +301,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     pg_version = env.pg_version
 
     # Stop endpoint while we recreate timeline
-    ep.stop()
+    flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
 
     try:
         pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
@@ -344,6 +349,11 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     assert not dump_from_wal_differs, "dump from WAL differs"
     assert not initial_dump_differs, "initial dump differs"
 
+    flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(
+        tenant_id, timeline_id, compact=False, wait_until_uploaded=True
+    )
+
 
 def dump_differs(
     first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None

From 7a485b599bd27ba135e3327bfb5710c495c99df6 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 22 Aug 2024 23:53:37 +0300
Subject: [PATCH 33/55] Fix race condition in LRU list update in
 get_cached_relsize (#8807)

## Problem

See https://neondb.slack.com/archives/C07J14D8GTX/p1724347552023709
The LRU list in the relation size cache is manipulated while holding only a
shared lock.

## Summary of changes

Take exclusive lock

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/relsize_cache.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
index cc7ac2c394..2a4c2dc799 100644
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 
 		tag.rinfo = rinfo;
 		tag.forknum = forknum;
-		LWLockAcquire(relsize_lock, LW_SHARED);
+		/* We need exclusive lock here because of LRU list manipulation */
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
 		if (entry != NULL)
 		{

From 6eb638f4b390270fa004cdea45e00ca63c21f773 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 22 Aug 2024 17:31:38 -0400
Subject: [PATCH 34/55] feat(pageserver): warn on aux v1 tenants + default to
 v2 (#8625)

part of https://github.com/neondatabase/neon/issues/8623

We want to discover potential aux v1 customers that we might have missed
from the migrations.

## Summary of changes

Log warnings on basebackup, load timeline, and the first put_file.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/pageserver_api/src/models.rs             |  2 +-
 pageserver/src/pgdatadir_mapping.rs           | 15 +++++++++++--
 pageserver/src/tenant.rs                      | 14 ++++++------
 pageserver/src/tenant/timeline.rs             |  5 +++++
 .../regress/test_logical_replication.py       | 22 +++++--------------
 5 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ab4adfbebe..d55c06b685 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
 
     /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
     pub fn default_tenant_config() -> Self {
-        Self::V1
+        Self::V2
     }
 }
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d6e0b82e1d..b7110d69b6 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -726,7 +726,17 @@ impl Timeline {
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
         let current_policy = self.last_aux_file_policy.load();
         match current_policy {
-            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
+            Some(AuxFilePolicy::V1) => {
+                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
+                self.list_aux_files_v1(lsn, ctx).await
+            }
+            None => {
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                if !res.is_empty() {
+                    warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
+                }
+                Ok(res)
+            }
             Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
             Some(AuxFilePolicy::CrossValidation) => {
                 let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -1587,6 +1597,7 @@ impl<'a> DatadirModification<'a> {
                 if aux_files_key_v1.is_empty() {
                     None
                 } else {
+                    warn!("this timeline is using deprecated aux file policy V1");
                     self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                     Some(AuxFilePolicy::V1)
                 }
@@ -2048,7 +2059,7 @@ mod tests {
 
         let (tenant, ctx) = harness.load().await;
         let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
         let tline = tline.raw_timeline().unwrap();
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 65a7504b74..2e19a46ac8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5932,10 +5932,10 @@ mod tests {
             .await
             .unwrap();
 
-        // the default aux file policy to switch is v1 if not set by the admins
+        // the default aux file policy to switch is v2 if not set by the admins
         assert_eq!(
             harness.tenant_conf.switch_aux_file_policy,
-            AuxFilePolicy::V1
+            AuxFilePolicy::default_tenant_config()
         );
         let (tenant, ctx) = harness.load().await;
 
@@ -5979,8 +5979,8 @@ mod tests {
         );
         assert_eq!(
             tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V1),
-            "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
+            Some(AuxFilePolicy::V2),
+            "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
         );
 
         // we can read everything from the storage
@@ -6002,8 +6002,8 @@ mod tests {
 
         assert_eq!(
             tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V1),
-            "keep v1 storage format when new files are written"
+            Some(AuxFilePolicy::V2),
+            "keep v2 storage format when new files are written"
         );
 
         let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6019,7 +6019,7 @@ mod tests {
 
         // child copies the last flag even if that is not on remote storage yet
         assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
-        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
+        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
 
         let files = child.list_aux_files(lsn, &ctx).await.unwrap();
         assert_eq!(files.get("pg_logical/mappings/test1"), None);
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e90f65942f..dc9cddea43 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2234,6 +2234,11 @@ impl Timeline {
 
                 handles: Default::default(),
             };
+
+            if aux_file_policy == Some(AuxFilePolicy::V1) {
+                warn!("this timeline is using deprecated aux file policy V1");
+            }
+
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
 
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 0d18aa43b7..f83a833dda 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -22,7 +22,7 @@ def random_string(n: int):
 
 
 @pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation]
+    "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation]
 )
 def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore):
     env = neon_simple_env
@@ -31,9 +31,7 @@ def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy:
         assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"]
 
 
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -175,9 +173,7 @@ COMMIT;
 
 
 # Test that neon.logical_replication_max_snap_files works
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
     def slot_removed(ep):
         assert (
@@ -355,9 +351,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
 #
 # Most pages start with a contrecord, so we don't do anything special
 # to ensure that.
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -402,9 +396,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
 # logical replication bug as such, but without logical replication,
 # records passed ot the WAL redo process are never large enough to hit
 # the bug.
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
     env = neon_simple_env
 
@@ -476,9 +468,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv):
     ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
 
 
-@pytest.mark.parametrize(
-    "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation]
-)
+@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation])
 def test_replication_shutdown(neon_simple_env: NeonEnv):
     # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed
     env = neon_simple_env

From ae63ac74887b9658c7a80f369b43247c1db51165 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 12 Aug 2024 14:57:50 -0500
Subject: [PATCH 35/55] Write messages field by field instead of bytes sheet in
 test_simple_sync_safekeepers

Co-authored-by: Arseny Sher <ars@neon.tech>
---
 libs/walproposer/build.rs           |   1 +
 libs/walproposer/src/walproposer.rs | 102 ++++++++++++++++++++--------
 2 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 3126b170a4..7bb077062b 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -95,6 +95,7 @@ fn main() -> anyhow::Result<()> {
         .allowlist_var("ERROR")
         .allowlist_var("FATAL")
         .allowlist_var("PANIC")
+        .allowlist_var("PG_VERSION_NUM")
         .allowlist_var("WPEVENT")
         .allowlist_var("WL_LATCH_SET")
         .allowlist_var("WL_SOCKET_READABLE")
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index 37b1e0fa87..ba75171db2 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -282,7 +282,11 @@ mod tests {
     use std::cell::UnsafeCell;
     use utils::id::TenantTimelineId;
 
-    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
+    use crate::{
+        api_bindings::Level,
+        bindings::{NeonWALReadResult, PG_VERSION_NUM},
+        walproposer::Wrapper,
+    };
 
     use super::ApiImpl;
 
@@ -489,41 +493,79 @@ mod tests {
 
         let (sender, receiver) = sync_channel(1);
 
+        // Message definitions are in walproposer.h
+        // xxx: it would be better to extract them from safekeeper crate and
+        // use serialization/deserialization here.
+        let greeting_tag = (b'g' as u64).to_ne_bytes();
+        let proto_version = 2_u32.to_ne_bytes();
+        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
+        let proposer_id = [0; 16];
+        let system_id = 0_u64.to_ne_bytes();
+        let tenant_id = ttid.tenant_id.as_arr();
+        let timeline_id = ttid.timeline_id.as_arr();
+        let pg_tli = 1_u32.to_ne_bytes();
+        let wal_seg_size = 16777216_u32.to_ne_bytes();
+        let proposer_greeting = [
+            greeting_tag.as_slice(),
+            proto_version.as_slice(),
+            pg_version.as_slice(),
+            proposer_id.as_slice(),
+            system_id.as_slice(),
+            tenant_id.as_slice(),
+            timeline_id.as_slice(),
+            pg_tli.as_slice(),
+            wal_seg_size.as_slice(),
+        ]
+        .concat();
+
+        let voting_tag = (b'v' as u64).to_ne_bytes();
+        let vote_request_term = 3_u64.to_ne_bytes();
+        let proposer_id = [0; 16];
+        let vote_request = [
+            voting_tag.as_slice(),
+            vote_request_term.as_slice(),
+            proposer_id.as_slice(),
+        ]
+        .concat();
+
+        let acceptor_greeting_term = 2_u64.to_ne_bytes();
+        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
+        let acceptor_greeting = [
+            greeting_tag.as_slice(),
+            acceptor_greeting_term.as_slice(),
+            acceptor_greeting_node_id.as_slice(),
+        ]
+        .concat();
+
+        let vote_response_term = 3_u64.to_ne_bytes();
+        let vote_given = 1_u64.to_ne_bytes();
+        let flush_lsn = 0x539_u64.to_ne_bytes();
+        let truncate_lsn = 0x539_u64.to_ne_bytes();
+        let th_len = 1_u32.to_ne_bytes();
+        let th_term = 2_u64.to_ne_bytes();
+        let th_lsn = 0x539_u64.to_ne_bytes();
+        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
+        let vote_response = [
+            voting_tag.as_slice(),
+            vote_response_term.as_slice(),
+            vote_given.as_slice(),
+            flush_lsn.as_slice(),
+            truncate_lsn.as_slice(),
+            th_len.as_slice(),
+            th_term.as_slice(),
+            th_lsn.as_slice(),
+            timeline_start_lsn.as_slice(),
+        ]
+        .concat();
+
         let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
             wait_events: Cell::new(WaitEventsData {
                 sk: std::ptr::null_mut(),
                 event_mask: 0,
             }),
-            expected_messages: vec![
-                // TODO: When updating Postgres versions, this test will cause
-                // problems. Postgres version in message needs updating.
-                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
-                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
-                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
-                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
-                ],
-                // VoteRequest(VoteRequest { term: 3 })
-                vec![
-                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                    0, 0, 0, 0, 0, 0,
-                ],
-            ],
+            expected_messages: vec![proposer_greeting, vote_request],
             expected_ptr: AtomicUsize::new(0),
-            safekeeper_replies: vec![
-                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
-                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                ],
-                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
-                vec![
-                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
-                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
-                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
-                ],
-            ],
+            safekeeper_replies: vec![acceptor_greeting, vote_response],
             replies_ptr: AtomicUsize::new(0),
             sync_channel: sender,
             shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),

From 6744ed19d8cc8cd09c6ccbbf66953e6ebb7a480d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 12:20:11 -0500
Subject: [PATCH 36/55] Update Postgres 14 to 14.13

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v14   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 3fd7a45f8a..b6910406e2 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737
+Subproject commit b6910406e2d05a2c94baa2e530ec882733047759
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 6e3e489b5d..c2b5fb8915 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -8,7 +8,7 @@
     "46b4b235f38413ab5974bb22c022f9b829257674"
   ],
   "v14": [
-    "14.12",
-    "3fd7a45f8aae85c080df6329e3c85887b7f3a737"
+    "14.13",
+    "b6910406e2d05a2c94baa2e530ec882733047759"
   ]
 }

From 66db381dc9b9238618165c7ef36fa29a0577806c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 12:27:05 -0500
Subject: [PATCH 37/55] Update Postgres 15 to 15.8

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 46b4b235f3..76063bff63 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674
+Subproject commit 76063bff638ccce7afa99fc9037ac51338b9823d
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c2b5fb8915..2921372c24 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -4,8 +4,8 @@
     "47a9122a5a150a3217fafd3f3d4fe8e020ea718a"
   ],
   "v15": [
-    "15.7",
-    "46b4b235f38413ab5974bb22c022f9b829257674"
+    "15.8",
+    "76063bff638ccce7afa99fc9037ac51338b9823d"
   ],
   "v14": [
     "14.13",

From 2f8d548a125c490b29eb4a6ab4d79ce358300e74 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 12:44:01 -0500
Subject: [PATCH 38/55] Update Postgres 16 to 16.4

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 vendor/postgres-v16   | 2 +-
 vendor/revisions.json | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 47a9122a5a..8efa089aa7 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a
+Subproject commit 8efa089aa7786381543a4f9efc69b92d43eab8c0
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 2921372c24..50cc99c2f1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
   "v16": [
-    "16.3",
-    "47a9122a5a150a3217fafd3f3d4fe8e020ea718a"
+    "16.4",
+    "8efa089aa7786381543a4f9efc69b92d43eab8c0"
   ],
   "v15": [
     "15.8",

From f7ab3ffcb781c14bf35da8260518456d00cea04d Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Tue, 13 Aug 2024 14:05:06 -0500
Subject: [PATCH 39/55] Check that TERM != dumb before using colors in
 pre-commit.py

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 pre-commit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pre-commit.py b/pre-commit.py
index c5ed63ac44..ae432e8225 100755
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -2,6 +2,7 @@
 
 import argparse
 import enum
+import os
 import subprocess
 import sys
 from typing import List
@@ -93,7 +94,7 @@ if __name__ == "__main__":
         "--no-color",
         action="store_true",
         help="disable colored output",
-        default=not sys.stdout.isatty(),
+        default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
     )
     args = parser.parse_args()
 

From dbdb8a1187d28cf98c93c9cc39c348db6d7e98f1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 23 Aug 2024 09:15:55 +0300
Subject: [PATCH 40/55] Document how to use "git merge" for PostgreSQL minor
 version upgrades. (#8692)

Our new policy is to use the "rebase" method and slice all the Neon
commits into a nice patch set when doing a new major version, and use
"merge" method on minor version upgrades on the release branches.

"git merge" preserves the git history of Neon commits on the Postgres
branches. While it's nice to rebase all the Neon changes to a logical
patch set against upstream, having to do it between every minor release
is a fair amount of work, and it loses the history, and is more
error-prone.
---
 docs/updating-postgres.md | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/docs/updating-postgres.md b/docs/updating-postgres.md
index 1868bbf5f7..7913b0a9e2 100644
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
 1. Create a new branch based on the stable branch you are updating.
 
     ```shell
-    git checkout -b my-branch REL_15_STABLE_neon
+    git checkout -b my-branch-15 REL_15_STABLE_neon
     ```
 
-1. Tag the last commit on the stable branch you are updating.
+1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
 
-    ```shell
-    git tag REL_15_3_neon
-    ```
-
-1. Push the new tag to the Neon Postgres repository.
-
-    ```shell
-    git push origin REL_15_3_neon
-    ```
-
-1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
-
-1. Rebase the branch you created on the tag and resolve any conflicts.
+1. Merge the upstream tag into the branch you created and resolve any conflicts.
 
     ```shell
     git fetch upstream REL_15_4
-    git rebase REL_15_4
+    git merge REL_15_4
     ```
 
+    In the commit message of the merge commit, mention if there were
+    any non-trivial conflicts or other issues.
+
 1. Run the Postgres test suite to make sure our commits have not affected
 Postgres in a negative way.
 
@@ -57,7 +48,7 @@ Postgres in a negative way.
 1. Push your branch to the Neon Postgres repository.
 
     ```shell
-    git push origin my-branch
+    git push origin my-branch-15
     ```
 
 1. Clone the Neon repository if you have not done so already.
@@ -74,7 +65,7 @@ branch.
 1. Update the Git submodule.
 
     ```shell
-    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule set-branch --branch my-branch-15 vendor/postgres-v15
     git submodule update --remote vendor/postgres-v15
     ```
 
@@ -89,14 +80,12 @@ minor Postgres release.
 
 1. Create a pull request, and wait for CI to go green.
 
-1. Force push the rebased Postgres branches into the Neon Postgres repository.
+1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
 
     ```shell
-    git push --force origin my-branch:REL_15_STABLE_neon
+    git push origin my-branch-15:REL_15_STABLE_neon
     ```
 
-    It may require disabling various branch protections.
-
 1. Update your Neon PR to point at the branches.
 
     ```shell

From d8ca495eae816ddfd5a06fed4e1e668fe1edad91 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Fri, 23 Aug 2024 12:48:26 +0200
Subject: [PATCH 41/55] Require poetry >=1.8 (#8812)

This was already a requirement for installing the python packages after
https://github.com/neondatabase/neon/pull/8609 got merged, so this
updates the documentation to reflect that.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f01442da5d..735edef0fc 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
 
 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
+Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
 
 
 #### Running neon database

From e80ab8fd6a99bf46463695986b9f19e2cb06c8d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 23 Aug 2024 13:14:14 +0200
Subject: [PATCH 42/55] Update serde_json to 1.0.125 (#8813)

Updates `serde_json` to `1.0.125`, rolling out speedups added by a
serde_json contributor.

Release [link](https://github.com/serde-rs/json/releases/tag/1.0.125).
Blog post
[link](https://purplesyringa.moe/blog/i-sped-up-serde-json-strings-by-20-percent/).
---
 Cargo.lock | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a506da8c02..250427da2b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5590,11 +5590,12 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.96"
+version = "1.0.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
+checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
 dependencies = [
  "itoa",
+ "memchr",
  "ryu",
  "serde",
 ]

From e62cd9e121928eca4f1f6b3ded4f5deb7e0a6110 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 23 Aug 2024 14:29:11 +0100
Subject: [PATCH 43/55] CI(autocomment): add arch to build type (#8809)

## Problem

Failed / flaky tests for different arches are indistinguishable in the
GitHub Autocomment

## Summary of changes
- Add arch to build type for GitHub autocomment
---
 scripts/comment-test-report.js | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js
index f42262cf48..e8e0b3c23a 100755
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -68,16 +68,29 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
                     console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
 
                     buildType = "release"
-                    pgVersion = "14"
+                    pgVersion = "16"
                 }
 
                 pgVersions.add(pgVersion)
 
+                // We use `arch` as it is returned by GitHub Actions
+                //  (RUNNER_ARCH env var): X86, X64, ARM, or ARM64
+                // Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
+                let arch = ""
+                if (test.parameters.includes("'X64'")) {
+                    arch = "x86-64"
+                } else if (test.parameters.includes("'ARM64'")) {
+                    arch = "arm64"
+                } else {
+                    arch = "unknown"
+                }
+
                 // Removing build type and PostgreSQL version from the test name to make it shorter
                 const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
                 test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
                 test.pgVersion = pgVersion
                 test.buildType = buildType
+                test.arch = arch
 
                 if (test.status === "passed") {
                     passedTests[pgVersion][testName].push(test)
@@ -144,7 +157,7 @@ const reportSummary = async (params) => {
                 const links = []
                 for (const test of tests) {
                     const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
-                    links.push(`[${test.buildType}](${allureLink})`)
+                    links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
                 }
                 summary += `- \`${testName}\`: ${links.join(", ")}\n`
             }
@@ -175,7 +188,7 @@ const reportSummary = async (params) => {
                     const links = []
                     for (const test of tests) {
                         const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
-                        links.push(`[${test.buildType}](${allureLink})`)
+                        links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
                     }
                     summary += `- \`${testName}\`: ${links.join(", ")}\n`
                 }

From 6a74bcadecd0ce4f088b5a22c6183ff980559d87 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 23 Aug 2024 09:32:00 -0400
Subject: [PATCH 44/55] feat(pageserver): remove features=testing restriction
 for compact (#8815)

A small PR to make it possible to run force compaction in staging for
btm-gc compaction testing.

Part of https://github.com/neondatabase/neon/issues/8002

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a4da8506d6..4635e76ea9 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1706,11 +1706,6 @@ async fn timeline_compact_handler(
         flags |= CompactFlags::ForceImageLayerCreation;
     }
     if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
-        if !cfg!(feature = "testing") {
-            return Err(ApiError::InternalServerError(anyhow!(
-                "enhanced_gc_bottom_most_compaction is only available in testing mode"
-            )));
-        }
         flags |= CompactFlags::EnhancedGcBottomMostCompaction;
     }
     let wait_until_uploaded =
@@ -2942,7 +2937,7 @@ pub fn make_router(
         )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
-            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
+            |r| api_handler(r, timeline_compact_handler),
         )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",

From bc8cfe1b5567715995b884231bc2785a32307ce8 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 23 Aug 2024 09:42:45 -0400
Subject: [PATCH 45/55] fix(pageserver): l0 check criteria (#8797)

close https://github.com/neondatabase/neon/issues/8579

## Summary of changes

The `is_l0` check now takes both layer key range and the layer type.
This allows us to have image layers covering the full key range in
btm-most compaction (upcoming PR). However, we still don't allow delta
layers to cover the full key range, and I will make btm-most compaction
generate delta layers with the key range of the keys existing in the
layer instead of `Key::MIN..Key::HACK_MAX` (upcoming PR).


Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/layer_map.rs                | 10 +++++-----
 pageserver/src/tenant/storage_layer/layer.rs      |  5 ++++-
 pageserver/src/tenant/storage_layer/layer_name.rs |  4 ++++
 pageserver/src/tenant/timeline.rs                 |  9 ++++++---
 storage_scrubber/src/checks.rs                    |  2 +-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 844f117ea2..707233b003 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -464,7 +464,7 @@ impl LayerMap {
     pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
         // TODO: See #3869, resulting #4088, attempted fix and repro #4094
 
-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
             self.l0_delta_layers.push(layer_desc.clone().into());
         }
 
@@ -483,7 +483,7 @@ impl LayerMap {
         self.historic
             .remove(historic_layer_coverage::LayerKey::from(layer_desc));
         let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
             let len_before = self.l0_delta_layers.len();
             let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
             l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -600,8 +600,8 @@ impl LayerMap {
     }
 
     /// Check if the key range resembles that of an L0 layer.
-    pub fn is_l0(key_range: &Range<Key>) -> bool {
-        key_range == &(Key::MIN..Key::MAX)
+    pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
+        is_delta_layer && key_range == &(Key::MIN..Key::MAX)
     }
 
     /// This function determines which layers are counted in `count_deltas`:
@@ -628,7 +628,7 @@ impl LayerMap {
     ///      than just the current partition_range.
     pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
         // Case 1
-        if !Self::is_l0(&layer.key_range) {
+        if !Self::is_l0(&layer.key_range, layer.is_delta) {
             return true;
         }
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 774f97e1d9..2607b574e7 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1296,7 +1296,10 @@ impl LayerInner {
                 lsn_end: lsn_range.end,
                 remote: !resident,
                 access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
+                l0: crate::tenant::layer_map::LayerMap::is_l0(
+                    &self.layer_desc().key_range,
+                    self.layer_desc().is_delta,
+                ),
             }
         } else {
             let lsn = self.desc.image_layer_lsn();
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index f33ca076ab..47ae556279 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -256,6 +256,10 @@ impl LayerName {
             LayerName::Delta(layer) => &layer.key_range,
         }
     }
+
+    pub fn is_delta(&self) -> bool {
+        matches!(self, LayerName::Delta(_))
+    }
 }
 
 impl fmt::Display for LayerName {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index dc9cddea43..b33e436fce 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3002,7 +3002,10 @@ impl Timeline {
         // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
         //   the layer is likely to be covered by an image layer during compaction.
         layers.sort_by_key(|(desc, _meta, _atime)| {
-            std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
+            std::cmp::Reverse((
+                !LayerMap::is_l0(&desc.key_range, desc.is_delta),
+                desc.lsn_range.end,
+            ))
         });
 
         let layers = layers
@@ -4585,7 +4588,7 @@ impl Timeline {
                 // for compact_level0_phase1 creating an L0, which does not happen in practice
                 // because we have not implemented L0 => L0 compaction.
                 duplicated_layers.insert(l.layer_desc().key());
-            } else if LayerMap::is_l0(&l.layer_desc().key_range) {
+            } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
                 return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
             } else {
                 insert_layers.push(l.clone());
@@ -5877,7 +5880,7 @@ mod tests {
             };
 
             // Apart from L0s, newest Layers should come first
-            if !LayerMap::is_l0(layer.name.key_range()) {
+            if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
                 assert!(layer_lsn <= last_lsn);
                 last_lsn = layer_lsn;
             }
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index b35838bcf7..08b0f06ebf 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -150,7 +150,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 
                             if response.is_err() {
                                 // Object is not present.
-                                let is_l0 = LayerMap::is_l0(layer.key_range());
+                                let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
 
                                 let msg = format!(
                                     "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",

From 73286e6b9f8a0ba4fb00dd4b44e613963b62cb21 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 23 Aug 2024 16:43:08 +0300
Subject: [PATCH 46/55] test: copy dict to avoid error on retry (#8811)

there is no "const" in python, so when we modify the global dict, it
will remain that way on the retry. fix to not have it influence other
tests which might be run on the same runner.

evidence:
<https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8625/10513146742/index.html#/testresult/453c4ce05ada7496>
---
 test_runner/fixtures/pageserver/utils.py       | 17 +++++++++++------
 test_runner/regress/test_s3_restore.py         |  6 +++---
 test_runner/regress/test_storage_controller.py |  4 ++--
 test_runner/regress/test_tenant_delete.py      |  8 ++++----
 test_runner/regress/test_timeline_delete.py    |  4 ++--
 5 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index b75a480a63..a74fef6a60 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -430,12 +430,17 @@ def enable_remote_storage_versioning(
     return response
 
 
-MANY_SMALL_LAYERS_TENANT_CONFIG = {
-    "gc_period": "0s",
-    "compaction_period": "0s",
-    "checkpoint_distance": 1024**2,
-    "image_creation_threshold": 100,
-}
+def many_small_layers_tenant_config() -> Dict[str, Any]:
+    """
+    Create a new dict to avoid issues with deleting from the global value.
+    In python, the global is mutable.
+    """
+    return {
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        "checkpoint_distance": 1024**2,
+        "image_creation_threshold": 100,
+    }
 
 
 def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 9992647e56..c1a80a54bc 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -8,9 +8,9 @@ from fixtures.neon_fixtures import (
     PgBin,
 )
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     enable_remote_storage_versioning,
+    many_small_layers_tenant_config,
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -33,7 +33,7 @@ def test_tenant_s3_restore(
 
     # change it back after initdb, recovery doesn't work if the two
     # index_part.json uploads happen at same second or too close to each other.
-    initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG
+    initial_tenant_conf = many_small_layers_tenant_config()
     del initial_tenant_conf["checkpoint_distance"]
 
     env = neon_env_builder.init_start(initial_tenant_conf)
@@ -50,7 +50,7 @@ def test_tenant_s3_restore(
     tenant_id = env.initial_tenant
 
     # now lets create the small layers
-    ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG)
+    ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config())
 
     # Default tenant and the one we created
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 94d71a7677..b3464b0c91 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -23,11 +23,11 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     assert_prefix_not_empty,
     enable_remote_storage_versioning,
     list_prefix,
+    many_small_layers_tenant_config,
     remote_storage_delete_key,
     timeline_delete_wait_completed,
 )
@@ -654,7 +654,7 @@ def test_storage_controller_s3_time_travel_recovery(
         tenant_id,
         shard_count=2,
         shard_stripe_size=8192,
-        tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG,
+        tenant_config=many_small_layers_tenant_config(),
     )
 
     # Check that the consistency check passes
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index dadf5ca672..448a28dc31 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -9,9 +9,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     assert_prefix_not_empty,
+    many_small_layers_tenant_config,
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -76,7 +76,7 @@ def test_tenant_delete_smoke(
 
     env.neon_cli.create_tenant(
         tenant_id=tenant_id,
-        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
+        conf=many_small_layers_tenant_config(),
     )
 
     # Default tenant and the one we created
@@ -215,7 +215,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
     # (and there is no way to reconstruct the used remote storage kind)
     remote_storage_kind = RemoteStorageKind.MOCK_S3
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
     ps_http = env.pageserver.http_client()
     tenant_id = env.initial_tenant
 
@@ -330,7 +330,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
 
     remote_storage_kind = RemoteStorageKind.MOCK_S3
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
 
     ps_http = env.pageserver.http_client()
     # create a tenant separate from the main tenant so that we have one remaining
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 6d96dda391..328131cd08 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -16,9 +16,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
     assert_prefix_empty,
     assert_prefix_not_empty,
+    many_small_layers_tenant_config,
     poll_for_remote_storage_iterations,
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
@@ -782,7 +782,7 @@ def test_timeline_delete_resumed_on_attach(
     remote_storage_kind = s3_storage()
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
 
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    env = neon_env_builder.init_start(initial_tenant_conf=many_small_layers_tenant_config())
 
     tenant_id = env.initial_tenant
 

From bcc68a7866c633d74a482266bfe34053a093b9d8 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 23 Aug 2024 14:48:06 +0100
Subject: [PATCH 47/55] storcon_cli: add support for drain and fill operations
 (#8791)

## Problem
We have been naughty and curl-ed storcon to fix-up drains and fills.

## Summary of changes
Add support for starting/cancelling drain/fill operations via
`storcon_cli`.
---
 control_plane/storcon_cli/src/main.rs | 135 ++++++++++++++++++++++++--
 storage_controller/src/http.rs        |   1 -
 2 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index e27491c1c8..35510ccbca 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -147,9 +147,9 @@ enum Command {
         #[arg(long)]
         threshold: humantime::Duration,
     },
-    // Drain a set of specified pageservers by moving the primary attachments to pageservers
+    // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
     // outside of the specified set.
-    Drain {
+    BulkMigrate {
         // Set of pageserver node ids to drain.
         #[arg(long)]
         nodes: Vec<NodeId>,
@@ -163,6 +163,34 @@ enum Command {
         #[arg(long)]
         dry_run: Option<bool>,
     },
+    /// Start draining the specified pageserver.
+    /// The drain is complete when the scheduling policy returns to active.
+    StartDrain {
+        #[arg(long)]
+        node_id: NodeId,
+    },
+    /// Cancel draining the specified pageserver and wait for `timeout`
+    /// for the operation to be canceled. May be retried.
+    CancelDrain {
+        #[arg(long)]
+        node_id: NodeId,
+        #[arg(long)]
+        timeout: humantime::Duration,
+    },
+    /// Start filling the specified pageserver.
+    /// The fill is complete when the scheduling policy returns to active.
+    StartFill {
+        #[arg(long)]
+        node_id: NodeId,
+    },
+    /// Cancel filling the specified pageserver and wait for `timeout`
+    /// for the operation to be canceled. May be retried.
+    CancelFill {
+        #[arg(long)]
+        node_id: NodeId,
+        #[arg(long)]
+        timeout: humantime::Duration,
+    },
 }
 
 #[derive(Parser)]
@@ -249,6 +277,34 @@ impl FromStr for NodeAvailabilityArg {
     }
 }
 
+async fn wait_for_scheduling_policy<F>(
+    client: Client,
+    node_id: NodeId,
+    timeout: Duration,
+    f: F,
+) -> anyhow::Result<NodeSchedulingPolicy>
+where
+    F: Fn(NodeSchedulingPolicy) -> bool,
+{
+    let waiter = tokio::time::timeout(timeout, async move {
+        loop {
+            let node = client
+                .dispatch::<(), NodeDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/node/{node_id}"),
+                    None,
+                )
+                .await?;
+
+            if f(node.scheduling) {
+                return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
+            }
+        }
+    });
+
+    Ok(waiter.await??)
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let cli = Cli::parse();
@@ -628,7 +684,7 @@ async fn main() -> anyhow::Result<()> {
                 })
                 .await?;
         }
-        Command::Drain {
+        Command::BulkMigrate {
             nodes,
             concurrency,
             max_shards,
@@ -657,7 +713,7 @@ async fn main() -> anyhow::Result<()> {
             }
 
             if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Drain requested for node which doesn't exist.")
+                anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
             }
 
             node_to_fill_descs.retain(|desc| {
@@ -669,7 +725,7 @@ async fn main() -> anyhow::Result<()> {
             });
 
             if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to drain to")
+                anyhow::bail!("There are no nodes to migrate to")
             }
 
             // Set the node scheduling policy to draining for the nodes which
@@ -690,7 +746,7 @@ async fn main() -> anyhow::Result<()> {
                     .await?;
             }
 
-            // Perform the drain: move each tenant shard scheduled on a node to
+            // Perform the migration: move each tenant shard scheduled on a node to
             // be drained to a node which is being filled. A simple round robin
             // strategy is used to pick the new node.
             let tenants = storcon_client
@@ -703,13 +759,13 @@ async fn main() -> anyhow::Result<()> {
 
             let mut selected_node_idx = 0;
 
-            struct DrainMove {
+            struct MigrationMove {
                 tenant_shard_id: TenantShardId,
                 from: NodeId,
                 to: NodeId,
             }
 
-            let mut moves: Vec<DrainMove> = Vec::new();
+            let mut moves: Vec<MigrationMove> = Vec::new();
 
             let shards = tenants
                 .into_iter()
@@ -739,7 +795,7 @@ async fn main() -> anyhow::Result<()> {
                     continue;
                 }
 
-                moves.push(DrainMove {
+                moves.push(MigrationMove {
                     tenant_shard_id: shard.tenant_shard_id,
                     from: shard
                         .node_attached
@@ -816,6 +872,67 @@ async fn main() -> anyhow::Result<()> {
                 failure
             );
         }
+        Command::StartDrain { node_id } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::PUT,
+                    format!("control/v1/node/{node_id}/drain"),
+                    None,
+                )
+                .await?;
+            println!("Drain started for {node_id}");
+        }
+        Command::CancelDrain { node_id, timeout } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::DELETE,
+                    format!("control/v1/node/{node_id}/drain"),
+                    None,
+                )
+                .await?;
+
+            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
+
+            let final_policy =
+                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
+                    use NodeSchedulingPolicy::*;
+                    matches!(sched, Active | PauseForRestart)
+                })
+                .await?;
+
+            println!(
+                "Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
+            );
+        }
+        Command::StartFill { node_id } => {
+            storcon_client
+                .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
+                .await?;
+
+            println!("Fill started for {node_id}");
+        }
+        Command::CancelFill { node_id, timeout } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::DELETE,
+                    format!("control/v1/node/{node_id}/fill"),
+                    None,
+                )
+                .await?;
+
+            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
+
+            let final_policy =
+                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
+                    use NodeSchedulingPolicy::*;
+                    matches!(sched, Active)
+                })
+                .await?;
+
+            println!(
+                "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
+            );
+        }
     }
 
     Ok(())
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 7bbd1541cf..207bd5a1e6 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1074,7 +1074,6 @@ pub fn make_router(
                 RequestName("control_v1_metadata_health_list_outdated"),
             )
         })
-        // TODO(vlad): endpoint for cancelling drain and fill
         // Tenant Shard operations
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
             tenant_service_handler(

From 612b643315fbda4b489ae512b14d9bd66a4fbacb Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 16:28:22 +0100
Subject: [PATCH 48/55] update diesel (#8816)

https://rustsec.org/advisories/RUSTSEC-2024-0365
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 250427da2b..441ca1ff86 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1672,9 +1672,9 @@ dependencies = [
 
 [[package]]
 name = "diesel"
-version = "2.2.1"
+version = "2.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
+checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
 dependencies = [
  "bitflags 2.4.1",
  "byteorder",

From f4cac1f30f096ceb8c1fa4a3281319883d10be6e Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Sat, 24 Aug 2024 00:38:42 +0800
Subject: [PATCH 49/55] impr(pageserver): error if keys are unordered in merge
 iter (#8818)

In case of corrupted delta layers, we can detect the corruption and bail
out of the compaction.

## Summary of changes

* Detect wrong delta desc of key range
* Detect unordered deltas

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 .../src/tenant/storage_layer/delta_layer.rs   | 16 +++++++++++
 .../src/tenant/storage_layer/image_layer.rs   | 15 +++++++++++
 .../tenant/storage_layer/merge_iterator.rs    | 27 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6c2391d72d..b1b5217f7f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -232,6 +232,18 @@ pub struct DeltaLayerInner {
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }
 
+impl DeltaLayerInner {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        format!(
+            "delta {}..{} {}..{}",
+            self.key_range().start,
+            self.key_range().end,
+            self.lsn_range().start,
+            self.lsn_range().end
+        )
+    }
+}
+
 impl std::fmt::Debug for DeltaLayerInner {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("DeltaLayerInner")
@@ -1527,6 +1539,10 @@ pub struct DeltaLayerIterator<'a> {
 }
 
 impl<'a> DeltaLayerIterator<'a> {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        self.delta_layer.layer_dbg_info()
+    }
+
     /// Retrieve a batch of key-value pairs into the iterator buffer.
     async fn next_batch(&mut self) -> anyhow::Result<()> {
         assert!(self.key_values_batch.is_empty());
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 9a19e4e2c7..94120a4e3e 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -167,6 +167,17 @@ pub struct ImageLayerInner {
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }
 
+impl ImageLayerInner {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        format!(
+            "image {}..{} {}",
+            self.key_range().start,
+            self.key_range().end,
+            self.lsn()
+        )
+    }
+}
+
 impl std::fmt::Debug for ImageLayerInner {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("ImageLayerInner")
@@ -1024,6 +1035,10 @@ pub struct ImageLayerIterator<'a> {
 }
 
 impl<'a> ImageLayerIterator<'a> {
+    pub(crate) fn layer_dbg_info(&self) -> String {
+        self.image_layer.layer_dbg_info()
+    }
+
     /// Retrieve a batch of key-value pairs into the iterator buffer.
     async fn next_batch(&mut self) -> anyhow::Result<()> {
         assert!(self.key_values_batch.is_empty());
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index b4bd976033..d2c341e5ce 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -3,6 +3,7 @@ use std::{
     collections::{binary_heap, BinaryHeap},
 };
 
+use anyhow::bail;
 use pageserver_api::key::Key;
 use utils::lsn::Lsn;
 
@@ -26,6 +27,13 @@ impl<'a> LayerRef<'a> {
             Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
         }
     }
+
+    fn layer_dbg_info(&self) -> String {
+        match self {
+            Self::Image(x) => x.layer_dbg_info(),
+            Self::Delta(x) => x.layer_dbg_info(),
+        }
+    }
 }
 
 enum LayerIterRef<'a> {
@@ -40,6 +48,13 @@ impl LayerIterRef<'_> {
             Self::Image(x) => x.next().await,
         }
     }
+
+    fn layer_dbg_info(&self) -> String {
+        match self {
+            Self::Image(x) => x.layer_dbg_info(),
+            Self::Delta(x) => x.layer_dbg_info(),
+        }
+    }
 }
 
 /// This type plays several roles at once
@@ -75,6 +90,11 @@ impl<'a> PeekableLayerIterRef<'a> {
     async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
         let result = self.peeked.take();
         self.peeked = self.iter.next().await?;
+        if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
+            if (k1, l1) < (k2, l2) {
+                bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
+            }
+        }
         Ok(result)
     }
 }
@@ -178,7 +198,12 @@ impl<'a> IteratorWrapper<'a> {
         let iter = PeekableLayerIterRef::create(iter).await?;
         if let Some((k1, l1, _)) = iter.peek() {
             let (k2, l2) = first_key_lower_bound;
-            debug_assert!((k1, l1) >= (k2, l2));
+            if (k1, l1) < (k2, l2) {
+                bail!(
+                    "layer key range did not include the first key in the layer: {}",
+                    layer.layer_dbg_info()
+                );
+            }
         }
         *self = Self::Loaded { iter };
         Ok(())

From c1cb7a0fa0d0bb6b58aa0f3e0979905476a19225 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 18:01:02 +0100
Subject: [PATCH 50/55] proxy: flesh out JWT verification code (#8805)

This change adds in the necessary verification steps for the JWT
payload, and adds per-role querying of JWKs as needed for #8736
---
 proxy/src/auth/backend/jwt.rs | 295 +++++++++++++++++++++++-----------
 1 file changed, 203 insertions(+), 92 deletions(-)

diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index e021a7e23f..49d5de16c3 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -1,15 +1,21 @@
-use std::{future::Future, sync::Arc, time::Duration};
+use std::{
+    future::Future,
+    sync::Arc,
+    time::{Duration, SystemTime},
+};
 
 use anyhow::{bail, ensure, Context};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
+use serde::{Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;
 
-use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
+use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
 
 // TODO(conrad): make these configurable.
+const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
 const MIN_RENEW: Duration = Duration::from_secs(30);
 const AUTO_RENEW: Duration = Duration::from_secs(300);
 const MAX_RENEW: Duration = Duration::from_secs(3600);
@@ -17,30 +23,56 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 
 /// How to get the JWT auth rules
 pub trait FetchAuthRules: Clone + Send + Sync + 'static {
-    fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
+    fn fetch_auth_rules(
+        &self,
+        role_name: RoleName,
+    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
 
-#[derive(Clone)]
-struct FetchAuthRulesFromCplane {
-    #[allow(dead_code)]
-    endpoint: EndpointIdInt,
-}
-
-impl FetchAuthRules for FetchAuthRulesFromCplane {
-    async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
-        Err(anyhow::anyhow!("not yet implemented"))
-    }
-}
-
-pub struct AuthRules {
-    jwks_urls: Vec<url::Url>,
+pub struct AuthRule {
+    pub id: String,
+    pub jwks_url: url::Url,
+    pub audience: Option<String>,
 }
 
 #[derive(Default)]
 pub struct JwkCache {
     client: reqwest::Client,
 
-    map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
+    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
+}
+
+pub struct JwkCacheEntry {
+    /// Should refetch at least every hour to verify when old keys have been removed.
+    /// Should refetch when new key IDs are seen only every 5 minutes or so
+    last_retrieved: Instant,
+
+    /// cplane will return multiple JWKs urls that we need to scrape.
+    key_sets: ahash::HashMap<String, KeySet>,
+}
+
+impl JwkCacheEntry {
+    fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
+        self.key_sets.values().find_map(|key_set| {
+            key_set
+                .find_key(key_id)
+                .map(|jwk| (jwk, key_set.audience.as_deref()))
+        })
+    }
+}
+
+struct KeySet {
+    jwks: jose_jwk::JwkSet,
+    audience: Option<String>,
+}
+
+impl KeySet {
+    fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
+        self.jwks
+            .keys
+            .iter()
+            .find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
+    }
 }
 
 pub struct JwkCacheEntryLock {
@@ -57,15 +89,6 @@ impl Default for JwkCacheEntryLock {
     }
 }
 
-pub struct JwkCacheEntry {
-    /// Should refetch at least every hour to verify when old keys have been removed.
-    /// Should refetch when new key IDs are seen only every 5 minutes or so
-    last_retrieved: Instant,
-
-    /// cplane will return multiple JWKs urls that we need to scrape.
-    key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
-}
-
 impl JwkCacheEntryLock {
     async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
         JwkRenewalPermit::acquire_permit(self).await
@@ -79,6 +102,7 @@ impl JwkCacheEntryLock {
         &self,
         _permit: JwkRenewalPermit<'_>,
         client: &reqwest::Client,
+        role_name: RoleName,
         auth_rules: &F,
     ) -> anyhow::Result<Arc<JwkCacheEntry>> {
         // double check that no one beat us to updating the cache.
@@ -91,20 +115,19 @@ impl JwkCacheEntryLock {
             }
         }
 
-        let rules = auth_rules.fetch_auth_rules().await?;
-        let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
-            rules.jwks_urls.len(),
-            ahash::RandomState::new(),
-        );
+        let rules = auth_rules.fetch_auth_rules(role_name).await?;
+        let mut key_sets =
+            ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
         // TODO(conrad): run concurrently
         // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
-        for url in rules.jwks_urls {
-            let req = client.get(url.clone());
+        for rule in rules {
+            let req = client.get(rule.jwks_url.clone());
             // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
+            // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
             match req.send().await.and_then(|r| r.error_for_status()) {
                 // todo: should we re-insert JWKs if we want to keep this JWKs URL?
                 // I expect these failures would be quite sparse.
-                Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
+                Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
                 Ok(r) => {
                     let resp: http::Response<reqwest::Body> = r.into();
                     match parse_json_body_with_limit::<jose_jwk::JwkSet>(
@@ -113,9 +136,17 @@ impl JwkCacheEntryLock {
                     )
                     .await
                     {
-                        Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
+                        Err(e) => {
+                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
+                        }
                         Ok(jwks) => {
-                            key_sets.insert(url, jwks);
+                            key_sets.insert(
+                                rule.id,
+                                KeySet {
+                                    jwks,
+                                    audience: rule.audience,
+                                },
+                            );
                         }
                     }
                 }
@@ -133,7 +164,9 @@ impl JwkCacheEntryLock {
 
     async fn get_or_update_jwk_cache<F: FetchAuthRules>(
         self: &Arc<Self>,
+        ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        role_name: RoleName,
         fetch: &F,
     ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
         let now = Instant::now();
@@ -141,18 +174,20 @@ impl JwkCacheEntryLock {
 
         // if we have no cached JWKs, try and get some
         let Some(cached) = guard else {
+            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, fetch).await;
+            return self.renew_jwks(permit, client, role_name, fetch).await;
         };
 
         let last_update = now.duration_since(cached.last_retrieved);
 
         // check if the cached JWKs need updating.
         if last_update > MAX_RENEW {
+            let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
 
             // it's been too long since we checked the keys. wait for them to update.
-            return self.renew_jwks(permit, client, fetch).await;
+            return self.renew_jwks(permit, client, role_name, fetch).await;
         }
 
         // every 5 minutes we should spawn a job to eagerly update the token.
@@ -164,7 +199,7 @@ impl JwkCacheEntryLock {
                 let client = client.clone();
                 let fetch = fetch.clone();
                 tokio::spawn(async move {
-                    if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
+                    if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
                         tracing::warn!(error=?e, "could not fetch JWKs in background job");
                     }
                 });
@@ -178,8 +213,10 @@ impl JwkCacheEntryLock {
 
     async fn check_jwt<F: FetchAuthRules>(
         self: &Arc<Self>,
-        jwt: String,
+        ctx: &RequestMonitoring,
+        jwt: &str,
         client: &reqwest::Client,
+        role_name: RoleName,
         fetch: &F,
     ) -> Result<(), anyhow::Error> {
         // JWT compact form is defined to be
@@ -189,36 +226,36 @@ impl JwkCacheEntryLock {
         let (header_payload, signature) = jwt
             .rsplit_once(".")
             .context("Provided authentication token is not a valid JWT encoding")?;
-        let (header, _payload) = header_payload
+        let (header, payload) = header_payload
             .split_once(".")
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
+        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
             .context("Provided authentication token is not a valid JWT encoding")?;
 
         ensure!(header.typ == "JWT");
-        let kid = header.kid.context("missing key id")?;
+        let kid = header.key_id.context("missing key id")?;
 
-        let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
+        let mut guard = self
+            .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
+            .await?;
 
         // get the key from the JWKs if possible. If not, wait for the keys to update.
-        let jwk = loop {
-            let jwk = guard
-                .key_sets
-                .values()
-                .flat_map(|jwks| &jwks.keys)
-                .find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
-
-            match jwk {
+        let (jwk, expected_audience) = loop {
+            match guard.find_jwk_and_audience(kid) {
                 Some(jwk) => break jwk,
                 None if guard.last_retrieved.elapsed() > MIN_RENEW => {
+                    let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+
                     let permit = self.acquire_permit().await;
-                    guard = self.renew_jwks(permit, client, fetch).await?;
+                    guard = self
+                        .renew_jwks(permit, client, role_name.clone(), fetch)
+                        .await?;
                 }
                 _ => {
                     bail!("jwk not found");
@@ -227,7 +264,7 @@ impl JwkCacheEntryLock {
         };
 
         ensure!(
-            jwk.is_supported(&header.alg),
+            jwk.is_supported(&header.algorithm),
             "signature algorithm not supported"
         );
 
@@ -241,31 +278,60 @@ impl JwkCacheEntryLock {
             key => bail!("unsupported key type {key:?}"),
         };
 
-        // TODO(conrad): verify iss, exp, nbf, etc...
+        let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
+            .context("Provided authentication token is not a valid JWT encoding")?;
+
+        tracing::debug!(?payload, "JWT signature valid with claims");
+
+        match (expected_audience, payload.audience) {
+            // check the audience matches
+            (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
+            // the audience is expected but is missing
+            (Some(_), None) => bail!("invalid JWT token audience"),
+            // we don't care for the audience field
+            (None, _) => {}
+        }
+
+        let now = SystemTime::now();
+
+        if let Some(exp) = payload.expiration {
+            ensure!(now < exp + CLOCK_SKEW_LEEWAY);
+        }
+
+        if let Some(nbf) = payload.not_before {
+            ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
+        }
 
         Ok(())
     }
 }
 
 impl JwkCache {
-    pub async fn check_jwt(
+    pub async fn check_jwt<F: FetchAuthRules>(
         &self,
-        endpoint: EndpointIdInt,
-        jwt: String,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
+        role_name: RoleName,
+        fetch: &F,
+        jwt: &str,
     ) -> Result<(), anyhow::Error> {
         // try with just a read lock first
-        let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
+        let key = (endpoint, role_name.clone());
+        let entry = self.map.get(&key).as_deref().map(Arc::clone);
         let entry = match entry {
             Some(entry) => entry,
             None => {
                 // acquire a write lock after to insert.
-                let entry = self.map.entry(endpoint).or_default();
+                let entry = self.map.entry(key).or_default();
                 Arc::clone(&*entry)
             }
         };
 
-        let fetch = FetchAuthRulesFromCplane { endpoint };
-        entry.check_jwt(jwt, &self.client, &fetch).await
+        entry
+            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
+            .await
     }
 }
 
@@ -315,13 +381,49 @@ fn verify_rsa_signature(
 
 /// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
 #[derive(serde::Deserialize, serde::Serialize)]
-struct JWTHeader<'a> {
+struct JwtHeader<'a> {
     /// must be "JWT"
+    #[serde(rename = "typ")]
     typ: &'a str,
     /// must be a supported alg
-    alg: jose_jwa::Algorithm,
+    #[serde(rename = "alg")]
+    algorithm: jose_jwa::Algorithm,
     /// key id, must be provided for our usecase
-    kid: Option<&'a str>,
+    #[serde(rename = "kid")]
+    key_id: Option<&'a str>,
+}
+
+/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
+#[derive(serde::Deserialize, serde::Serialize, Debug)]
+struct JwtPayload<'a> {
+    /// Audience - Recipient for which the JWT is intended
+    #[serde(rename = "aud")]
+    audience: Option<&'a str>,
+    /// Expiration - Time after which the JWT expires
+    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
+    expiration: Option<SystemTime>,
+    /// Not before - Time before which the JWT must not be accepted
+    #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
+    not_before: Option<SystemTime>,
+
+    // the following entries are only extracted for the sake of debug logging.
+    /// Issuer of the JWT
+    #[serde(rename = "iss")]
+    issuer: Option<&'a str>,
+    /// Subject of the JWT (the user)
+    #[serde(rename = "sub")]
+    subject: Option<&'a str>,
+    /// Unique token identifier
+    #[serde(rename = "jti")]
+    jwt_id: Option<&'a str>,
+    /// Unique session identifier
+    #[serde(rename = "sid")]
+    session_id: Option<&'a str>,
+}
+
+fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
+    let d = <Option<u64>>::deserialize(d)?;
+    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
 }
 
 struct JwkRenewalPermit<'a> {
@@ -388,6 +490,8 @@ impl Drop for JwkRenewalPermit<'_> {
 
 #[cfg(test)]
 mod tests {
+    use crate::RoleName;
+
     use super::*;
 
     use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
@@ -431,10 +535,10 @@ mod tests {
     }
 
     fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
-        let header = JWTHeader {
+        let header = JwtHeader {
             typ: "JWT",
-            alg: jose_jwa::Algorithm::Signing(sig),
-            kid: Some(&kid),
+            algorithm: jose_jwa::Algorithm::Signing(sig),
+            key_id: Some(&kid),
         };
         let body = typed_json::json! {{
             "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
@@ -524,33 +628,40 @@ mod tests {
         struct Fetch(SocketAddr);
 
         impl FetchAuthRules for Fetch {
-            async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
-                Ok(AuthRules {
-                    jwks_urls: vec![
-                        format!("http://{}/foo", self.0).parse().unwrap(),
-                        format!("http://{}/bar", self.0).parse().unwrap(),
-                    ],
-                })
+            async fn fetch_auth_rules(
+                &self,
+                _role_name: RoleName,
+            ) -> anyhow::Result<Vec<AuthRule>> {
+                Ok(vec![
+                    AuthRule {
+                        id: "foo".to_owned(),
+                        jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
+                        audience: None,
+                    },
+                    AuthRule {
+                        id: "bar".to_owned(),
+                        jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
+                        audience: None,
+                    },
+                ])
             }
         }
 
+        let role_name = RoleName::from("user");
+
         let jwk_cache = Arc::new(JwkCacheEntryLock::default());
 
-        jwk_cache
-            .check_jwt(jwt1, &client, &Fetch(addr))
-            .await
-            .unwrap();
-        jwk_cache
-            .check_jwt(jwt2, &client, &Fetch(addr))
-            .await
-            .unwrap();
-        jwk_cache
-            .check_jwt(jwt3, &client, &Fetch(addr))
-            .await
-            .unwrap();
-        jwk_cache
-            .check_jwt(jwt4, &client, &Fetch(addr))
-            .await
-            .unwrap();
+        for token in [jwt1, jwt2, jwt3, jwt4] {
+            jwk_cache
+                .check_jwt(
+                    &RequestMonitoring::test(),
+                    &token,
+                    &client,
+                    role_name.clone(),
+                    &Fetch(addr),
+                )
+                .await
+                .unwrap();
+        }
     }
 }

From b65a95f12ef958a509e30f0d650a820b4e2e8c58 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 23 Aug 2024 18:32:56 +0100
Subject: [PATCH 51/55] controller: use PageserverUtilization for scheduling
 (#8711)

## Problem

Previously, the controller only used the shard counts for scheduling.
This works well when hosting only many-sharded tenants, but works much
less well when hosting single-sharded tenants that have a greater
deviation in size-per-shard.

Closes: https://github.com/neondatabase/neon/issues/7798

## Summary of changes

- Instead of UtilizationScore, carry the full PageserverUtilization
through into the Scheduler.
- Use the PageserverUtilization::score() instead of shard count when
ordering nodes in scheduling.

Q: Why did test_sharding_split_smoke need updating in this PR?
A: There's an interesting side effect during shard splits: because we do
not decrement the shard count in the utilization when we de-schedule the
shards from before the split, the controller will now prefer to pick
_different_ nodes for shards compared with which ones held secondaries
before the split. We could use our knowledge of splitting to fix up the
utilizations more actively in this situation, but I'm leaning toward
leaving the code simpler, as in practical systems the impact of one
shard on the utilization of a node should be fairly low (single digit
%).
---
 libs/pageserver_api/src/controller_api.rs     |  21 +-
 libs/pageserver_api/src/models/utilization.rs |  67 +++++-
 pageserver/src/metrics.rs                     |   8 +
 pageserver/src/tenant.rs                      |  18 +-
 pageserver/src/utilization.rs                 |  10 +-
 storage_controller/src/heartbeater.rs         |  10 +-
 storage_controller/src/node.rs                |  24 +-
 storage_controller/src/scheduler.rs           | 225 +++++++++++++++---
 storage_controller/src/service.rs             |  39 +--
 storage_controller/src/tenant_shard.rs        |   4 +-
 test_runner/regress/test_sharding.py          |  15 +-
 11 files changed, 340 insertions(+), 101 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a50707a1b8..a9a57d77ce 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId};
 
+use crate::models::PageserverUtilization;
 use crate::{
     models::{ShardParameters, TenantConfig},
     shard::{ShardStripeSize, TenantShardId},
@@ -140,23 +141,11 @@ pub struct TenantShardMigrateRequest {
     pub node_id: NodeId,
 }
 
-/// Utilisation score indicating how good a candidate a pageserver
-/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
-/// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
-pub struct UtilizationScore(pub u64);
-
-impl UtilizationScore {
-    pub fn worst() -> Self {
-        UtilizationScore(u64::MAX)
-    }
-}
-
-#[derive(Serialize, Clone, Copy, Debug)]
+#[derive(Serialize, Clone, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
     // Normal, happy state
-    Active(UtilizationScore),
+    Active(PageserverUtilization),
     // Node is warming up, but we expect it to become available soon. Covers
     // the time span between the re-attach response being composed on the storage controller
     // and the first successful heartbeat after the processing of the re-attach response
@@ -195,7 +184,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
         match val {
             // Assume the worst utilisation score to begin with. It will later be updated by
             // the heartbeats.
-            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
+            NodeAvailabilityWrapper::Active => {
+                NodeAvailability::Active(PageserverUtilization::full())
+            }
             NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
             NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
         }
diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index 0fec221276..844a0cda5d 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
     pub max_shard_count: u32,
 
     /// Cached result of [`Self::score`]
-    pub utilization_score: u64,
+    pub utilization_score: Option<u64>,
 
     /// When was this snapshot captured, pageserver local time.
     ///
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
     Percent::new(0).unwrap()
 }
 
+pub type RawScore = u64;
+
 impl PageserverUtilization {
     const UTILIZATION_FULL: u64 = 1000000;
 
@@ -62,7 +64,7 @@ impl PageserverUtilization {
     /// - Negative values are forbidden
     /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
     ///   layer eviction.
-    pub fn score(&self) -> u64 {
+    pub fn score(&self) -> RawScore {
         let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
             * self.disk_usable_pct.get() as u64)
             / 100;
@@ -74,8 +76,30 @@ impl PageserverUtilization {
         std::cmp::max(disk_utilization_score, shard_utilization_score)
     }
 
-    pub fn refresh_score(&mut self) {
-        self.utilization_score = self.score();
+    pub fn cached_score(&mut self) -> RawScore {
+        match self.utilization_score {
+            None => {
+                let s = self.score();
+                self.utilization_score = Some(s);
+                s
+            }
+            Some(s) => s,
+        }
+    }
+
+    /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
+    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
+    pub fn is_overloaded(score: RawScore) -> bool {
+        score >= Self::UTILIZATION_FULL
+    }
+
+    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
+        if self.shard_count < shard_count {
+            self.shard_count = shard_count;
+
+            // Dirty cache: this will be calculated next time someone retrieves the score
+            self.utilization_score = None;
+        }
     }
 
     /// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -88,7 +112,38 @@ impl PageserverUtilization {
             disk_usable_pct: Percent::new(100).unwrap(),
             shard_count: 1,
             max_shard_count: 1,
-            utilization_score: Self::UTILIZATION_FULL,
+            utilization_score: Some(Self::UTILIZATION_FULL),
+            captured_at: serde_system_time::SystemTime(SystemTime::now()),
+        }
+    }
+}
+
+/// Test helper
+pub mod test_utilization {
+    use super::PageserverUtilization;
+    use std::time::SystemTime;
+    use utils::{
+        serde_percent::Percent,
+        serde_system_time::{self},
+    };
+
+    // Parameters of the imaginary node used for test utilization instances
+    const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
+    const TEST_SHARDS_MAX: u32 = 1000;
+
+    /// Unit test helper.  Unconditionally compiled because cfg(test) doesn't carry across crates.  Do
+    /// not abuse this function from non-test code.
+    ///
+    /// Emulates a node with a 1000 shard limit and a 1TB disk.
+    pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
+        PageserverUtilization {
+            disk_usage_bytes: disk_wanted_bytes,
+            free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
+            disk_wanted_bytes,
+            disk_usable_pct: Percent::new(100).unwrap(),
+            shard_count,
+            max_shard_count: TEST_SHARDS_MAX,
+            utilization_score: None,
             captured_at: serde_system_time::SystemTime(SystemTime::now()),
         }
     }
@@ -120,7 +175,7 @@ mod tests {
             disk_usage_bytes: u64::MAX,
             free_space_bytes: 0,
             disk_wanted_bytes: u64::MAX,
-            utilization_score: 13,
+            utilization_score: Some(13),
             disk_usable_pct: Percent::new(90).unwrap(),
             shard_count: 100,
             max_shard_count: 200,
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 0a1a22b6e8..1f8634df93 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1803,6 +1803,14 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
+pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_utilization_score",
+        "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_secondary_heatmap_total_size",
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 2e19a46ac8..3a7afff211 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3741,13 +3741,21 @@ impl Tenant {
     /// less than this (via eviction and on-demand downloads), but this function enables
     /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
     /// by keeping important things on local disk.
+    ///
+    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
+    /// than they report here, due to layer eviction.  Tenants with many active branches may
+    /// actually use more than they report here.
     pub(crate) fn local_storage_wanted(&self) -> u64 {
-        let mut wanted = 0;
         let timelines = self.timelines.lock().unwrap();
-        for timeline in timelines.values() {
-            wanted += timeline.metrics.visible_physical_size_gauge.get();
-        }
-        wanted
+
+        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
+        // reflects the observation that on tenants with multiple large branches, typically only one
+        // of them is used actively enough to occupy space on disk.
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .max()
+            .unwrap_or(0)
     }
 }
 
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index 3c48c84598..a0223f3bce 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;
 
 use pageserver_api::models::PageserverUtilization;
 
-use crate::{config::PageServerConf, tenant::mgr::TenantManager};
+use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
 
 pub(crate) fn regenerate(
     conf: &PageServerConf,
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
         disk_usable_pct,
         shard_count,
         max_shard_count: MAX_SHARDS,
-        utilization_score: 0,
+        utilization_score: None,
         captured_at: utils::serde_system_time::SystemTime(captured_at),
     };
 
-    doc.refresh_score();
-
-    // TODO: make utilization_score into a metric
+    // Initialize `PageserverUtilization::utilization_score`
+    let score = doc.cached_score();
+    NODE_UTILIZATION_SCORE.set(score);
 
     Ok(doc)
 }
diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index c0e27bafdb..b7e66d33eb 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -6,10 +6,7 @@ use std::{
 };
 use tokio_util::sync::CancellationToken;
 
-use pageserver_api::{
-    controller_api::{NodeAvailability, UtilizationScore},
-    models::PageserverUtilization,
-};
+use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
 
 use thiserror::Error;
 use utils::id::NodeId;
@@ -147,7 +144,8 @@ impl HeartbeaterTask {
                 // goes through to the pageserver even when the node is marked offline.
                 // This doesn't impact the availability observed by [`crate::service::Service`].
                 let mut node_clone = node.clone();
-                node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                node_clone
+                    .set_availability(NodeAvailability::Active(PageserverUtilization::full()));
 
                 async move {
                     let response = node_clone
@@ -179,7 +177,7 @@ impl HeartbeaterTask {
                         node.get_availability()
                     {
                         PageserverState::WarmingUp {
-                            started_at: last_seen_at,
+                            started_at: *last_seen_at,
                         }
                     } else {
                         PageserverState::Offline
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index ea765ca123..61a44daca9 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -92,15 +92,15 @@ impl Node {
         }
     }
 
-    pub(crate) fn get_availability(&self) -> NodeAvailability {
-        self.availability
+    pub(crate) fn get_availability(&self) -> &NodeAvailability {
+        &self.availability
     }
 
     pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
         use AvailabilityTransition::*;
         use NodeAvailability::WarmingUp;
 
-        match self.get_availability_transition(availability) {
+        match self.get_availability_transition(&availability) {
             ToActive => {
                 // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                 // users of previously-cloned copies of the node will still see the old cancellation
@@ -115,8 +115,8 @@ impl Node {
             Unchanged | ToWarmingUpFromOffline => {}
         }
 
-        if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
-            self.availability = WarmingUp(std::cmp::max(crnt, proposed));
+        if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
+            self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
         } else {
             self.availability = availability;
         }
@@ -126,12 +126,12 @@ impl Node {
     /// into a description of the transition.
     pub(crate) fn get_availability_transition(
         &self,
-        availability: NodeAvailability,
+        availability: &NodeAvailability,
     ) -> AvailabilityTransition {
         use AvailabilityTransition::*;
         use NodeAvailability::*;
 
-        match (self.availability, availability) {
+        match (&self.availability, availability) {
             (Offline, Active(_)) => ToActive,
             (Active(_), Offline) => ToOffline,
             (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
@@ -153,15 +153,15 @@ impl Node {
 
     /// Is this node elegible to have work scheduled onto it?
     pub(crate) fn may_schedule(&self) -> MaySchedule {
-        let score = match self.availability {
-            NodeAvailability::Active(score) => score,
+        let utilization = match &self.availability {
+            NodeAvailability::Active(u) => u.clone(),
             NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
         };
 
         match self.scheduling {
-            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
             NodeSchedulingPolicy::Draining => MaySchedule::No,
-            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
             NodeSchedulingPolicy::Pause => MaySchedule::No,
             NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
         }
@@ -285,7 +285,7 @@ impl Node {
     pub(crate) fn describe(&self) -> NodeDescribeResponse {
         NodeDescribeResponse {
             id: self.id,
-            availability: self.availability.into(),
+            availability: self.availability.clone().into(),
             scheduling: self.scheduling,
             listen_http_addr: self.listen_http_addr.clone(),
             listen_http_port: self.listen_http_port,
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 843159010d..060e3cc6ca 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,6 +1,6 @@
 use crate::{node::Node, tenant_shard::TenantShard};
 use itertools::Itertools;
-use pageserver_api::controller_api::UtilizationScore;
+use pageserver_api::models::PageserverUtilization;
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -20,9 +20,9 @@ impl From<ScheduleError> for ApiError {
     }
 }
 
-#[derive(Serialize, Eq, PartialEq)]
+#[derive(Serialize)]
 pub enum MaySchedule {
-    Yes(UtilizationScore),
+    Yes(PageserverUtilization),
     No,
 }
 
@@ -282,6 +282,28 @@ impl Scheduler {
                 node.shard_count -= 1;
             }
         }
+
+        // Maybe update PageserverUtilization
+        match update {
+            RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
+                // Referencing the node: if this takes our shard_count above the utilization structure's
+                // shard count, then artificially bump it: this ensures that the scheduler immediately
+                // recognizes that this node has more work on it, without waiting for the next heartbeat
+                // to update the utilization.
+                if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
+                    utilization.adjust_shard_count_max(node.shard_count as u32);
+                }
+            }
+            RefCountUpdate::PromoteSecondary
+            | RefCountUpdate::Detach
+            | RefCountUpdate::RemoveSecondary
+            | RefCountUpdate::DemoteAttached => {
+                // De-referencing the node: leave the utilization's shard_count at a stale higher
+                // value until some future heartbeat after we have physically removed this shard
+                // from the node: this prevents the scheduler over-optimistically trying to schedule
+                // more work onto the node before earlier detaches are done.
+            }
+        }
     }
 
     // Check if the number of shards attached to a given node is lagging below
@@ -326,7 +348,18 @@ impl Scheduler {
         use std::collections::hash_map::Entry::*;
         match self.nodes.entry(node.get_id()) {
             Occupied(mut entry) => {
-                entry.get_mut().may_schedule = node.may_schedule();
+                // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
+                // to account for any shards scheduled on the controller but not yet visible to the pageserver.
+                let mut may_schedule = node.may_schedule();
+                match &mut may_schedule {
+                    MaySchedule::Yes(utilization) => {
+                        utilization.adjust_shard_count_max(entry.get().shard_count as u32);
+                    }
+                    MaySchedule::No => { // Nothing to tweak
+                    }
+                }
+
+                entry.get_mut().may_schedule = may_schedule;
             }
             Vacant(entry) => {
                 entry.insert(SchedulerNode {
@@ -363,7 +396,7 @@ impl Scheduler {
                 let may_schedule = self
                     .nodes
                     .get(node_id)
-                    .map(|n| n.may_schedule != MaySchedule::No)
+                    .map(|n| !matches!(n.may_schedule, MaySchedule::No))
                     .unwrap_or(false);
                 (*node_id, may_schedule)
             })
@@ -383,7 +416,7 @@ impl Scheduler {
     /// the same tenant on the same node.  This is a soft constraint: the context will never
     /// cause us to fail to schedule a shard.
     pub(crate) fn schedule_shard(
-        &self,
+        &mut self,
         hard_exclude: &[NodeId],
         context: &ScheduleContext,
     ) -> Result<NodeId, ScheduleError> {
@@ -391,31 +424,41 @@ impl Scheduler {
             return Err(ScheduleError::NoPageservers);
         }
 
-        let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
             .nodes
-            .iter()
-            .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
-                    None
-                } else {
-                    Some((
-                        *k,
-                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
-                        v.shard_count,
-                        v.attached_shard_count,
-                    ))
-                }
+            .iter_mut()
+            .filter_map(|(k, v)| match &mut v.may_schedule {
+                MaySchedule::No => None,
+                MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
+                MaySchedule::Yes(utilization) => Some((
+                    *k,
+                    context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
+                    utilization.cached_score(),
+                    v.attached_shard_count,
+                )),
             })
             .collect();
 
+        // Exclude nodes whose utilization is critically high, if there are alternatives available.  This will
+        // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
+        // we may place shards in the same tenant together on the same pageserver if all other pageservers are
+        // overloaded.
+        let non_overloaded_scores = scores
+            .iter()
+            .filter(|i| !PageserverUtilization::is_overloaded(i.2))
+            .copied()
+            .collect::<Vec<_>>();
+        if !non_overloaded_scores.is_empty() {
+            scores = non_overloaded_scores;
+        }
+
         // Sort by, in order of precedence:
         //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Attached shard count.  Within nodes with the same affinity, we always pick the node with
-        //  the least number of attached shards.
-        //  3rd: Total shard count.  Within nodes with the same affinity and attached shard count, use nodes
-        //  with the lower total shard count.
+        //  2nd: Utilization score (this combines shard count and disk utilization)
+        //  3rd: Attached shard count.  When nodes have identical utilization (e.g. when populating some
+        //       empty nodes), this acts as an anti-affinity between attached shards.
         //  4th: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
+        scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
 
         if scores.is_empty() {
             // After applying constraints, no pageservers were left.
@@ -429,7 +472,7 @@ impl Scheduler {
                 for (node_id, node) in &self.nodes {
                     tracing::info!(
                         "Node {node_id}: may_schedule={} shards={}",
-                        node.may_schedule != MaySchedule::No,
+                        !matches!(node.may_schedule, MaySchedule::No),
                         node.shard_count
                     );
                 }
@@ -469,7 +512,7 @@ impl Scheduler {
 pub(crate) mod test_utils {
 
     use crate::node::Node;
-    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
+    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
     use std::collections::HashMap;
     use utils::id::NodeId;
     /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -486,7 +529,7 @@ pub(crate) mod test_utils {
                         format!("pghost-{i}"),
                         5432 + i as u16,
                     );
-                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+                    node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                     assert!(node.is_available());
                     node
                 })
@@ -497,6 +540,8 @@ pub(crate) mod test_utils {
 
 #[cfg(test)]
 mod tests {
+    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+
     use super::*;
 
     use crate::tenant_shard::IntentState;
@@ -557,4 +602,130 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    /// Test the PageserverUtilization's contribution to scheduling algorithm
+    fn scheduler_utilization() {
+        let mut nodes = test_utils::make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Need to keep these alive because they contribute to shard counts via RAII
+        let mut scheduled_intents = Vec::new();
+
+        let empty_context = ScheduleContext::default();
+
+        fn assert_scheduler_chooses(
+            expect_node: NodeId,
+            scheduled_intents: &mut Vec<IntentState>,
+            scheduler: &mut Scheduler,
+            context: &ScheduleContext,
+        ) {
+            let scheduled = scheduler.schedule_shard(&[], context).unwrap();
+            let mut intent = IntentState::new();
+            intent.set_attached(scheduler, Some(scheduled));
+            scheduled_intents.push(intent);
+            assert_eq!(scheduled, expect_node);
+        }
+
+        // Independent schedule calls onto empty nodes should round-robin, because each node's
+        // utilization's shard count is updated inline.  The order is deterministic because when all other factors are
+        // equal, we order by node ID.
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+
+        // Manually setting utilization higher should cause schedule calls to round-robin the other nodes
+        // which have equal utilization.
+        nodes
+            .get_mut(&NodeId(1))
+            .unwrap()
+            .set_availability(NodeAvailability::Active(test_utilization::simple(
+                10,
+                1024 * 1024 * 1024,
+            )));
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &empty_context,
+        );
+
+        // The scheduler should prefer nodes with lower affinity score,
+        // even if they have higher utilization (as long as they aren't utilized at >100%)
+        let mut context_prefer_node1 = ScheduleContext::default();
+        context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]);
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+        assert_scheduler_chooses(
+            NodeId(1),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+
+        // If a node is over-utilized, it will not be used even if affinity scores prefer it
+        nodes
+            .get_mut(&NodeId(1))
+            .unwrap()
+            .set_availability(NodeAvailability::Active(test_utilization::simple(
+                20000,
+                1024 * 1024 * 1024,
+            )));
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+        assert_scheduler_chooses(
+            NodeId(2),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+        assert_scheduler_chooses(
+            NodeId(3),
+            &mut scheduled_intents,
+            &mut scheduler,
+            &context_prefer_node1,
+        );
+
+        for mut intent in scheduled_intents {
+            intent.clear(&mut scheduler);
+        }
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 453e96bad3..4b0c556824 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -44,7 +44,7 @@ use pageserver_api::{
         NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
         TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
         TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
@@ -542,7 +542,7 @@ impl Service {
             let locked = self.inner.read().unwrap();
             locked.nodes.clone()
         };
-        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
+        let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
 
         // List of tenants for which we will attempt to notify compute of their location at startup
         let mut compute_notifications = Vec::new();
@@ -556,10 +556,8 @@ impl Service {
             // Mark nodes online if they responded to us: nodes are offline by default after a restart.
             let mut new_nodes = (**nodes).clone();
             for (node_id, node) in new_nodes.iter_mut() {
-                if let Some(utilization) = nodes_online.get(node_id) {
-                    node.set_availability(NodeAvailability::Active(UtilizationScore(
-                        utilization.utilization_score,
-                    )));
+                if let Some(utilization) = nodes_online.remove(node_id) {
+                    node.set_availability(NodeAvailability::Active(utilization));
                     scheduler.node_upsert(node);
                 }
             }
@@ -925,9 +923,9 @@ impl Service {
             if let Ok(deltas) = res {
                 for (node_id, state) in deltas.0 {
                     let new_availability = match state {
-                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
-                            UtilizationScore(utilization.utilization_score),
-                        ),
+                        PageserverState::Available { utilization, .. } => {
+                            NodeAvailability::Active(utilization)
+                        }
                         PageserverState::WarmingUp { started_at } => {
                             NodeAvailability::WarmingUp(started_at)
                         }
@@ -936,14 +934,17 @@ impl Service {
                             // while the heartbeat round was on-going. Hence, filter out
                             // offline transitions for WarmingUp nodes that are still within
                             // their grace period.
-                            if let Ok(NodeAvailability::WarmingUp(started_at)) =
-                                self.get_node(node_id).await.map(|n| n.get_availability())
+                            if let Ok(NodeAvailability::WarmingUp(started_at)) = self
+                                .get_node(node_id)
+                                .await
+                                .as_ref()
+                                .map(|n| n.get_availability())
                             {
                                 let now = Instant::now();
-                                if now - started_at >= self.config.max_warming_up_interval {
+                                if now - *started_at >= self.config.max_warming_up_interval {
                                     NodeAvailability::Offline
                                 } else {
-                                    NodeAvailability::WarmingUp(started_at)
+                                    NodeAvailability::WarmingUp(*started_at)
                                 }
                             } else {
                                 NodeAvailability::Offline
@@ -1625,7 +1626,7 @@ impl Service {
         // This Node is a mutable local copy: we will set it active so that we can use its
         // API client to reconcile with the node.  The Node in [`Self::nodes`] will get updated
         // later.
-        node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
+        node.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
 
         let configs = match node
             .with_client_retries(
@@ -2473,7 +2474,7 @@ impl Service {
         .await;
 
         let node = {
-            let locked = self.inner.read().unwrap();
+            let mut locked = self.inner.write().unwrap();
             // Just a sanity check to prevent misuse: the API expects that the tenant is fully
             // detached everywhere, and nothing writes to S3 storage. Here, we verify that,
             // but only at the start of the process, so it's really just to prevent operator
@@ -2500,7 +2501,7 @@ impl Service {
                     return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
                 }
             }
-            let scheduler = &locked.scheduler;
+            let scheduler = &mut locked.scheduler;
             // Right now we only perform the operation on a single node without parallelization
             // TODO fan out the operation to multiple nodes for better performance
             let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
@@ -4761,7 +4762,7 @@ impl Service {
         //
         // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
         // nothing else can mutate its availability while we run.
-        let availability_transition = if let Some(input_availability) = availability {
+        let availability_transition = if let Some(input_availability) = availability.as_ref() {
             let (activate_node, availability_transition) = {
                 let locked = self.inner.read().unwrap();
                 let Some(node) = locked.nodes.get(&node_id) else {
@@ -4797,8 +4798,8 @@ impl Service {
             ));
         };
 
-        if let Some(availability) = &availability {
-            node.set_availability(*availability);
+        if let Some(availability) = availability.as_ref() {
+            node.set_availability(availability.clone());
         }
 
         if let Some(scheduling) = scheduling {
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 1fcc3c8547..30723a3b36 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -779,7 +779,7 @@ impl TenantShard {
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
     pub(crate) fn optimize_secondary(
         &self,
-        scheduler: &Scheduler,
+        scheduler: &mut Scheduler,
         schedule_context: &ScheduleContext,
     ) -> Option<ScheduleOptimization> {
         if self.intent.secondary.is_empty() {
@@ -1595,7 +1595,7 @@ pub(crate) mod tests {
         schedule_context.avoid(&shard_b.intent.all_pageservers());
         schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
 
-        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
+        let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
 
         // Since there is a node with no locations available, the node with two locations for the
         // same tenant should generate an optimization to move one away
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 1011a6fd22..bfd82242e9 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -394,6 +394,7 @@ def test_sharding_split_smoke(
 
     # Note which pageservers initially hold a shard after tenant creation
     pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)]
+    log.info("Pre-split pageservers: {pre_split_pageserver_ids}")
 
     # For pageservers holding a shard, validate their ingest statistics
     # reflect a proper splitting of the WAL.
@@ -555,9 +556,9 @@ def test_sharding_split_smoke(
     assert sum(total.values()) == split_shard_count * 2
     check_effective_tenant_config()
 
-    # More specific check: that we are fully balanced.  This is deterministic because
-    # the order in which we consider shards for optimization is deterministic, and the
-    # order of preference of nodes is also deterministic (lower node IDs win).
+    # More specific check: that we are fully balanced.  It is deterministic that we will get exactly
+    # one shard on each pageserver, because for these small shards the utilization metric is
+    # dominated by shard count.
     log.info(f"total: {total}")
     assert total == {
         1: 1,
@@ -577,8 +578,14 @@ def test_sharding_split_smoke(
         15: 1,
         16: 1,
     }
+
+    # The controller is not required to lay out the attached locations in any particular way, but
+    # all the pageservers that originally held an attached shard should still hold one, otherwise
+    # it would indicate that we had done some unnecessary migration.
     log.info(f"attached: {attached}")
-    assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1}
+    for ps_id in pre_split_pageserver_ids:
+        log.info("Pre-split pageserver {ps_id} should still hold an attached location")
+        assert ps_id in attached
 
     # Ensure post-split pageserver locations survive a restart (i.e. the child shards
     # correctly wrote config to disk, and the storage controller responds correctly

From 0aa14509368d81acf253f406ffafd740bf13b01a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 23 Aug 2024 18:56:05 +0100
Subject: [PATCH 52/55] storage controller: enable timeline CRUD operations to
 run concurrently with reconciliation & make them safer (#8783)

## Problem

- If a reconciler was waiting to be able to notify computes about a
change, but the control plane was waiting for the controller to finish a
timeline creation/deletion, the overall system can deadlock.
- If a tenant shard was migrated concurrently with a timeline
creation/deletion, there was a risk that the timeline operation could be
applied to a non-latest-generation location, and thereby not really be
persistent. This has never happened in practice, but would eventually
happen at scale.

Closes: #8743

## Summary of changes

- Introduce `Service::tenant_remote_mutation` helper, which looks up
shards & generations and passes them into an inner function that may do
remote I/O to pageservers. Before returning success, this helper checks
that generations haven't incremented, to guarantee that changes are
persistent.
- Convert tenant_timeline_create, tenant_timeline_delete, and
tenant_timeline_detach_ancestor to use this helper.
- These functions no longer block on ensure_attached unless the tenant
was never attached at all, so they should make progress even if we can't
complete compute notifications.

This increases the database load from timeline/create operations, but
only with cheap read transactions.
---
 .../down.sql                                  |   2 +
 .../2024-08-23-170149_tenant_id_index/up.sql  |   2 +
 storage_controller/src/persistence.rs         |  38 ++
 storage_controller/src/service.rs             | 480 +++++++++---------
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 .../regress/test_storage_controller.py        |  66 ++-
 6 files changed, 360 insertions(+), 232 deletions(-)
 create mode 100644 storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
 create mode 100644 storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql

diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
new file mode 100644
index 0000000000..518c747100
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/down.sql
@@ -0,0 +1,2 @@
+-- This file should undo anything in `up.sql`
+DROP INDEX tenant_shards_tenant_id;
\ No newline at end of file
diff --git a/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql
new file mode 100644
index 0000000000..dd6b37781a
--- /dev/null
+++ b/storage_controller/migrations/2024-08-23-170149_tenant_id_index/up.sql
@@ -0,0 +1,2 @@
+-- Index tenant_shards on tenant_id so that per-tenant lookups can use an index.
+CREATE INDEX tenant_shards_tenant_id ON tenant_shards (tenant_id);
\ No newline at end of file
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 16df19026c..1a905753a1 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -91,6 +91,7 @@ pub(crate) enum DatabaseOperation {
     Detach,
     ReAttach,
     IncrementGeneration,
+    PeekGenerations,
     ListTenantShards,
     InsertTenantShards,
     UpdateTenantShard,
@@ -502,6 +503,43 @@ impl Persistence {
         Ok(Generation::new(g as u32))
     }
 
+    /// When we want to call out to the running shards for a tenant, e.g. during timeline CRUD operations,
+    /// we need to know where the shard is attached, _and_ the generation, so that we can re-check the generation
+    /// afterwards to confirm that our timeline CRUD operation is truly persistent (it must have happened in the
+    /// latest generation)
+    ///
+    /// If the tenant doesn't exist, an empty vector is returned.
+    ///
+    /// Output is sorted by shard number
+    pub(crate) async fn peek_generations(
+        &self,
+        filter_tenant_id: TenantId,
+    ) -> Result<Vec<(TenantShardId, Option<Generation>, Option<NodeId>)>, DatabaseError> {
+        use crate::schema::tenant_shards::dsl::*;
+        let rows = self
+            .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
+                let result = tenant_shards
+                    .filter(tenant_id.eq(filter_tenant_id.to_string()))
+                    .select(TenantShardPersistence::as_select())
+                    .order(shard_number)
+                    .load(conn)?;
+                Ok(result)
+            })
+            .await?;
+
+        Ok(rows
+            .into_iter()
+            .map(|p| {
+                (
+                    p.get_tenant_shard_id()
+                        .expect("Corrupt tenant shard id in database"),
+                    p.generation.map(|g| Generation::new(g as u32)),
+                    p.generation_pageserver.map(|n| NodeId(n as u64)),
+                )
+            })
+            .collect())
+    }
+
     #[allow(non_local_definitions)]
     /// For use when updating a persistent property of a tenant, such as its config or placement_policy.
     ///
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 4b0c556824..7daa1e4f5f 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2854,82 +2854,67 @@ impl Service {
         .await;
         failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");
 
-        self.ensure_attached_wait(tenant_id).await?;
+        self.tenant_remote_mutation(tenant_id, move |mut targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            };
+            let shard_zero = targets.remove(0);
 
-        let mut targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
+            async fn create_one(
+                tenant_shard_id: TenantShardId,
+                node: Node,
+                jwt: Option<String>,
+                create_req: TimelineCreateRequest,
+            ) -> Result<TimelineInfo, ApiError> {
+                tracing::info!(
+                    "Creating timeline on shard {}/{}, attached to node {node}",
+                    tenant_shard_id,
+                    create_req.new_timeline_id,
+                );
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
 
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                targets.push((*tenant_shard_id, node.clone()));
+                client
+                    .timeline_create(tenant_shard_id, &create_req)
+                    .await
+                    .map_err(|e| passthrough_api_error(&node, e))
             }
-            targets
-        };
 
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant not found").into(),
-            ));
-        };
-        let shard_zero = targets.remove(0);
-
-        async fn create_one(
-            tenant_shard_id: TenantShardId,
-            node: Node,
-            jwt: Option<String>,
-            create_req: TimelineCreateRequest,
-        ) -> Result<TimelineInfo, ApiError> {
-            tracing::info!(
-                "Creating timeline on shard {}/{}, attached to node {node}",
-                tenant_shard_id,
-                create_req.new_timeline_id,
-            );
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-
-            client
-                .timeline_create(tenant_shard_id, &create_req)
-                .await
-                .map_err(|e| passthrough_api_error(&node, e))
-        }
-
-        // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
-        // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
-        // that will get the first creation request, and propagate the LSN to all the >0 shards.
-        let timeline_info = create_one(
-            shard_zero.0,
-            shard_zero.1,
-            self.config.jwt_token.clone(),
-            create_req.clone(),
-        )
-        .await?;
-
-        // Propagate the LSN that shard zero picked, if caller didn't provide one
-        if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() {
-            create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
-        }
-
-        // Create timeline on remaining shards with number >0
-        if !targets.is_empty() {
-            // If we had multiple shards, issue requests for the remainder now.
-            let jwt = &self.config.jwt_token;
-            self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
-                let create_req = create_req.clone();
-                Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
-            })
+            // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
+            // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
+            // that will get the first creation request, and propagate the LSN to all the >0 shards.
+            let timeline_info = create_one(
+                shard_zero.0,
+                shard_zero.1,
+                self.config.jwt_token.clone(),
+                create_req.clone(),
+            )
             .await?;
-        }
 
-        Ok(timeline_info)
+            // Propagate the LSN that shard zero picked, if caller didn't provide one
+            if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none()
+            {
+                create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
+            }
+
+            // Create timeline on remaining shards with number >0
+            if !targets.is_empty() {
+                // If we had multiple shards, issue requests for the remainder now.
+                let jwt = &self.config.jwt_token;
+                self.tenant_for_shards(
+                    targets.iter().map(|t| (t.0, t.1.clone())).collect(),
+                    |tenant_shard_id: TenantShardId, node: Node| {
+                        let create_req = create_req.clone();
+                        Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
+                    },
+                )
+                .await?;
+            }
+
+            Ok(timeline_info)
+        })
+        .await?
     }
 
     pub(crate) async fn tenant_timeline_detach_ancestor(
@@ -2946,107 +2931,87 @@ impl Service {
         )
         .await;
 
-        self.ensure_attached_wait(tenant_id).await?;
-
-        let targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                targets.push((*tenant_shard_id, node.clone()));
+        self.tenant_remote_mutation(tenant_id, move |targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
             }
-            targets
-        };
 
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant not found").into(),
-            ));
-        }
+            async fn detach_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+            ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
+                tracing::info!(
+                    "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
+                );
 
-        async fn detach_one(
-            tenant_shard_id: TenantShardId,
-            timeline_id: TimelineId,
-            node: Node,
-            jwt: Option<String>,
-        ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
-            tracing::info!(
-                "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
-            );
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
 
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+                client
+                    .timeline_detach_ancestor(tenant_shard_id, timeline_id)
+                    .await
+                    .map_err(|e| {
+                        use mgmt_api::Error;
 
-            client
-                .timeline_detach_ancestor(tenant_shard_id, timeline_id)
-                .await
-                .map_err(|e| {
-                    use mgmt_api::Error;
-
-                    match e {
-                        // no ancestor (ever)
-                        Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
-                            "{node}: {}",
-                            msg.strip_prefix("Conflict: ").unwrap_or(&msg)
-                        )),
-                        // too many ancestors
-                        Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
-                            ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
+                        match e {
+                            // no ancestor (ever)
+                            Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
+                                "{node}: {}",
+                                msg.strip_prefix("Conflict: ").unwrap_or(&msg)
+                            )),
+                            // too many ancestors
+                            Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
+                                ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
+                            }
+                            Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => {
+                                // avoid turning these into conflicts to remain compatible with
+                                // pageservers, 500 errors are sadly retryable with timeline ancestor
+                                // detach
+                                ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}"))
+                            }
+                            // rest can be mapped as usual
+                            other => passthrough_api_error(&node, other),
                         }
-                        Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => {
-                            // avoid turning these into conflicts to remain compatible with
-                            // pageservers, 500 errors are sadly retryable with timeline ancestor
-                            // detach
-                            ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}"))
-                        }
-                        // rest can be mapped as usual
-                        other => passthrough_api_error(&node, other),
-                    }
+                    })
+                    .map(|res| (tenant_shard_id.shard_number, res))
+            }
+
+            // no shard needs to go first/last; the operation should be idempotent
+            let mut results = self
+                .tenant_for_shards(targets, |tenant_shard_id, node| {
+                    futures::FutureExt::boxed(detach_one(
+                        tenant_shard_id,
+                        timeline_id,
+                        node,
+                        self.config.jwt_token.clone(),
+                    ))
                 })
-                .map(|res| (tenant_shard_id.shard_number, res))
-        }
+                .await?;
 
-        // no shard needs to go first/last; the operation should be idempotent
-        let mut results = self
-            .tenant_for_shards(targets, |tenant_shard_id, node| {
-                futures::FutureExt::boxed(detach_one(
-                    tenant_shard_id,
-                    timeline_id,
-                    node,
-                    self.config.jwt_token.clone(),
-                ))
-            })
-            .await?;
+            let any = results.pop().expect("we must have at least one response");
 
-        let any = results.pop().expect("we must have at least one response");
+            let mismatching = results
+                .iter()
+                .filter(|(_, res)| res != &any.1)
+                .collect::<Vec<_>>();
+            if !mismatching.is_empty() {
+                // this can be hit by races which should not happen because operation lock on cplane
+                let matching = results.len() - mismatching.len();
+                tracing::error!(
+                    matching,
+                    compared_against=?any,
+                    ?mismatching,
+                    "shards returned different results"
+                );
 
-        let mismatching = results
-            .iter()
-            .filter(|(_, res)| res != &any.1)
-            .collect::<Vec<_>>();
-        if !mismatching.is_empty() {
-            // this can be hit by races which should not happen because operation lock on cplane
-            let matching = results.len() - mismatching.len();
-            tracing::error!(
-                matching,
-                compared_against=?any,
-                ?mismatching,
-                "shards returned different results"
-            );
+                return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required.")));
+            }
 
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required.")));
-        }
-
-        Ok(any.1)
+            Ok(any.1)
+        }).await?
     }
 
     /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
@@ -3117,6 +3082,84 @@ impl Service {
         results
     }
 
+    /// Helper for safely working with the shards in a tenant remotely on pageservers, for example
+    /// when creating and deleting timelines:
+    /// - Makes sure shards are attached somewhere if they weren't already
+    /// - Looks up the shards and the nodes where they were most recently attached
+    /// - Checks, after the inner function returns, that the shards' generations haven't moved on (otherwise a
+    ///   retryable 503 is returned): on success the remote operation acted on the most recent generation, and is therefore durable.
+    async fn tenant_remote_mutation<R, O, F>(
+        &self,
+        tenant_id: TenantId,
+        op: O,
+    ) -> Result<R, ApiError>
+    where
+        O: FnOnce(Vec<(TenantShardId, Node)>) -> F,
+        F: std::future::Future<Output = R>,
+    {
+        let target_gens = {
+            let mut targets = Vec::new();
+
+            // Load the currently attached pageservers for the latest generation of each shard.  This can
+            // run concurrently with reconciliations, and it is not guaranteed that the node we find here
+            // will still be the latest when we're done: we will check generations again at the end of
+            // this function to handle that.
+            let generations = self.persistence.peek_generations(tenant_id).await?;
+            let generations = if generations.iter().any(|i| i.1.is_none()) {
+                // One or more shards is not attached to anything: maybe this is a new tenant?  Wait for
+                // it to reconcile.
+                self.ensure_attached_wait(tenant_id).await?;
+                self.persistence.peek_generations(tenant_id).await?
+            } else {
+                generations
+            };
+
+            let locked = self.inner.read().unwrap();
+            for (tenant_shard_id, generation, generation_pageserver) in generations {
+                let node_id = generation_pageserver.ok_or(ApiError::Conflict(
+                    "Tenant not currently attached".to_string(),
+                ))?;
+                let node = locked
+                    .nodes
+                    .get(&node_id)
+                    .ok_or(ApiError::Conflict(format!(
+                        "Raced with removal of node {node_id}"
+                    )))?;
+                targets.push((tenant_shard_id, node.clone(), generation));
+            }
+
+            targets
+        };
+
+        let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect();
+        let result = op(targets).await;
+
+        // Post-check: are all the generations of all the shards the same as they were initially?  This proves that
+        // our remote operation executed on the latest generation and is therefore persistent.
+        {
+            let latest_generations = self.persistence.peek_generations(tenant_id).await?;
+            if latest_generations
+                .into_iter()
+                .map(|g| (g.0, g.1))
+                .collect::<Vec<_>>()
+                != target_gens
+                    .into_iter()
+                    .map(|i| (i.0, i.2))
+                    .collect::<Vec<_>>()
+            {
+                // We raced with something that incremented the generation, and therefore cannot be
+                // confident that our actions are persistent (they might have hit an old generation).
+                //
+                // This is safe but requires a retry: ask the client to do that by giving them a 503 response.
+                return Err(ApiError::ResourceUnavailable(
+                    "Tenant attachment changed, please retry".into(),
+                ));
+            }
+        }
+
+        Ok(result)
+    }
+
     pub(crate) async fn tenant_timeline_delete(
         &self,
         tenant_id: TenantId,
@@ -3130,83 +3173,62 @@ impl Service {
         )
         .await;
 
-        self.ensure_attached_wait(tenant_id).await?;
-
-        let mut targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                targets.push((*tenant_shard_id, node.clone()));
+        self.tenant_remote_mutation(tenant_id, move |mut targets| async move {
+            if targets.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
             }
-            targets
-        };
+            let shard_zero = targets.remove(0);
 
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant not found").into(),
-            ));
-        }
-        let shard_zero = targets.remove(0);
+            async fn delete_one(
+                tenant_shard_id: TenantShardId,
+                timeline_id: TimelineId,
+                node: Node,
+                jwt: Option<String>,
+            ) -> Result<StatusCode, ApiError> {
+                tracing::info!(
+                    "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
+                );
 
-        async fn delete_one(
-            tenant_shard_id: TenantShardId,
-            timeline_id: TimelineId,
-            node: Node,
-            jwt: Option<String>,
-        ) -> Result<StatusCode, ApiError> {
-            tracing::info!(
-                "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
-            );
+                let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
+                client
+                    .timeline_delete(tenant_shard_id, timeline_id)
+                    .await
+                    .map_err(|e| {
+                        ApiError::InternalServerError(anyhow::anyhow!(
+                            "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+                        ))
+                    })
+            }
 
-            let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
-            client
-                .timeline_delete(tenant_shard_id, timeline_id)
-                .await
-                .map_err(|e| {
-                    ApiError::InternalServerError(anyhow::anyhow!(
-                        "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
+            let statuses = self
+                .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
+                    Box::pin(delete_one(
+                        tenant_shard_id,
+                        timeline_id,
+                        node,
+                        self.config.jwt_token.clone(),
                     ))
                 })
-        }
+                .await?;
 
-        let statuses = self
-            .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
-                Box::pin(delete_one(
-                    tenant_shard_id,
-                    timeline_id,
-                    node,
-                    self.config.jwt_token.clone(),
-                ))
-            })
+            // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+            if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
+                return Ok(StatusCode::ACCEPTED);
+            }
+
+            // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
+            // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
+            let shard_zero_status = delete_one(
+                shard_zero.0,
+                timeline_id,
+                shard_zero.1,
+                self.config.jwt_token.clone(),
+            )
             .await?;
-
-        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
-        if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
-            return Ok(StatusCode::ACCEPTED);
-        }
-
-        // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
-        // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
-        let shard_zero_status = delete_one(
-            shard_zero.0,
-            timeline_id,
-            shard_zero.1,
-            self.config.jwt_token.clone(),
-        )
-        .await?;
-
-        Ok(shard_zero_status)
+            Ok(shard_zero_status)
+        }).await?
     }
 
     /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 2bb698f175..92febfec9b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2284,7 +2284,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
             self.allowed_errors,
         )
 
-    def pageserver_api(self) -> PageserverHttpClient:
+    def pageserver_api(self, *args, **kwargs) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
         per-tenant actions into per-shard actions (e.g. timeline creation).  Tests should invoke those
@@ -2293,7 +2293,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         auth_token = None
         if self.auth_enabled:
             auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.port, lambda: True, auth_token)
+        return PageserverHttpClient(self.port, lambda: True, auth_token, *args, **kwargs)
 
     def request(self, method, *args, **kwargs) -> requests.Response:
         resp = requests.request(method, *args, **kwargs)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index b3464b0c91..03eb7628be 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     TokenScope,
     last_flush_lsn_upload,
 )
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
     assert_prefix_not_empty,
@@ -41,6 +41,7 @@ from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
 from pytest_httpserver import HTTPServer
+from urllib3 import Retry
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
@@ -2266,3 +2267,66 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
 
     # allow for small delay between actually having cancelled and being able reconfigure again
     wait_until(4, 0.5, reconfigure_node_again)
+
+
+def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
+    """
+    The storage controller is meant to handle the case where a timeline CRUD operation races
+    with a generation-incrementing change to the tenant: this should trigger a retry so that
+    the operation lands on the tenant location with the highest generation.
+    """
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_id)
+
+    # Set up a failpoint so that a timeline creation will be very slow
+    failpoint = "timeline-creation-after-uninit"
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints((failpoint, "sleep(10000)"))
+
+    # Start a timeline creation in the background
+    create_timeline_id = TimelineId.generate()
+    futs = []
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
+    ) as executor:
+        futs.append(
+            executor.submit(
+                env.storage_controller.pageserver_api(
+                    retries=Retry(
+                        status=0,
+                        connect=0,  # Disable retries: we want to see the 503
+                    )
+                ).timeline_create,
+                PgVersion.NOT_SET,
+                tenant_id,
+                create_timeline_id,
+            )
+        )
+
+        def has_hit_failpoint():
+            assert any(
+                ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
+            )
+
+        wait_until(10, 1, has_hit_failpoint)
+
+        # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
+        # can detach from the old pageserver, which will happen once the failpoint completes.
+        env.storage_controller.tenant_shard_migrate(
+            TenantShardId(tenant_id, 0, 0), env.pageservers[1].id
+        )
+
+        with pytest.raises(PageserverApiException, match="Tenant attachment changed, please retry"):
+            futs[0].result(timeout=20)
+
+    # Timeline creation should work when there isn't a concurrent migration, even though it's
+    # slow (our failpoint is still enabled)
+    env.storage_controller.pageserver_api(
+        retries=Retry(
+            status=0,
+            connect=0,  # Disable retries: an unexpected 503 here should fail the test, not be retried
+        )
+    ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)

From 701cb61b572eb0ef3cc29697f86aab36aafbba70 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 19:48:06 +0100
Subject: [PATCH 53/55] proxy: local auth backend (#8806)

Adds a Local authentication backend. Updates the HTTP handling to extract JWT
bearer tokens and pass them to the local backend for validation.
---
 proxy/src/auth/backend.rs             | 34 +++++++++---
 proxy/src/auth/backend/local.rs       | 79 +++++++++++++++++++++++++++
 proxy/src/console/messages.rs         | 22 ++++++++
 proxy/src/console/provider.rs         |  1 +
 proxy/src/proxy/connect_compute.rs    |  7 +--
 proxy/src/serverless.rs               |  2 +-
 proxy/src/serverless/backend.rs       | 71 ++++++++++++++++++------
 proxy/src/serverless/conn_pool.rs     | 12 +++-
 proxy/src/serverless/sql_over_http.rs | 58 +++++++++++++++-----
 9 files changed, 240 insertions(+), 46 deletions(-)
 create mode 100644 proxy/src/auth/backend/local.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 7592d076ec..ae72bc6de3 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -2,6 +2,7 @@ mod classic;
 mod hacks;
 pub mod jwt;
 mod link;
+pub mod local;
 
 use std::net::IpAddr;
 use std::sync::Arc;
@@ -9,6 +10,7 @@ use std::time::Duration;
 
 use ipnet::{Ipv4Net, Ipv6Net};
 pub use link::LinkAuthError;
+use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
 use tracing::{info, warn};
@@ -68,6 +70,8 @@ pub enum BackendType<'a, T, D> {
     Console(MaybeOwned<'a, ConsoleBackend>, T),
     /// Authentication via a web browser.
     Link(MaybeOwned<'a, url::ApiUrl>, D),
+    /// Local proxy uses configured auth credentials and does not wake compute
+    Local(MaybeOwned<'a, LocalBackend>),
 }
 
 pub trait TestBackend: Send + Sync + 'static {
@@ -93,6 +97,7 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
                 ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
             },
             Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Self::Local(_) => fmt.debug_tuple("Local").finish(),
         }
     }
 }
@@ -104,6 +109,7 @@ impl<T, D> BackendType<'_, T, D> {
         match self {
             Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
             Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
+            Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)),
         }
     }
 }
@@ -116,6 +122,7 @@ impl<'a, T, D> BackendType<'a, T, D> {
         match self {
             Self::Console(c, x) => BackendType::Console(c, f(x)),
             Self::Link(c, x) => BackendType::Link(c, x),
+            Self::Local(l) => BackendType::Local(l),
         }
     }
 }
@@ -126,6 +133,7 @@ impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
         match self {
             Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
             Self::Link(c, x) => Ok(BackendType::Link(c, x)),
+            Self::Local(l) => Ok(BackendType::Local(l)),
         }
     }
 }
@@ -157,6 +165,7 @@ impl ComputeUserInfo {
 pub enum ComputeCredentialKeys {
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
+    None,
 }
 
 impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -289,7 +298,7 @@ async fn auth_quirks(
             ctx.set_endpoint_id(res.info.endpoint.clone());
             let password = match res.keys {
                 ComputeCredentialKeys::Password(p) => p,
-                ComputeCredentialKeys::AuthKeys(_) => {
+                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
                     unreachable!("password hack should return a password")
                 }
             };
@@ -401,6 +410,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         match self {
             Self::Console(_, user_info) => user_info.endpoint_id.clone(),
             Self::Link(_, _) => Some("link".into()),
+            Self::Local(_) => Some("local".into()),
         }
     }
 
@@ -409,6 +419,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         match self {
             Self::Console(_, user_info) => &user_info.user,
             Self::Link(_, _) => "link",
+            Self::Local(_) => "local",
         }
     }
 
@@ -450,6 +461,9 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
 
                 BackendType::Link(url, info)
             }
+            Self::Local(_) => {
+                return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
+            }
         };
 
         info!("user successfully authenticated");
@@ -465,6 +479,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         match self {
             Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
             Self::Link(_, _) => Ok(Cached::new_uncached(None)),
+            Self::Local(_) => Ok(Cached::new_uncached(None)),
         }
     }
 
@@ -475,6 +490,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
         match self {
             Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
         }
     }
 }
@@ -488,13 +504,15 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
             Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
+            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
 
-    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
+    fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
-            Self::Console(_, creds) => Some(&creds.keys),
-            Self::Link(_, _) => None,
+            Self::Console(_, creds) => &creds.keys,
+            Self::Link(_, _) => &ComputeCredentialKeys::None,
+            Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
 }
@@ -508,13 +526,15 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
         match self {
             Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
             Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
+            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
         }
     }
 
-    fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
+    fn get_keys(&self) -> &ComputeCredentialKeys {
         match self {
-            Self::Console(_, creds) => Some(&creds.keys),
-            Self::Link(_, _) => None,
+            Self::Console(_, creds) => &creds.keys,
+            Self::Link(_, _) => &ComputeCredentialKeys::None,
+            Self::Local(_) => &ComputeCredentialKeys::None,
         }
     }
 }
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
new file mode 100644
index 0000000000..6d18564dd6
--- /dev/null
+++ b/proxy/src/auth/backend/local.rs
@@ -0,0 +1,79 @@
+use std::{collections::HashMap, net::SocketAddr};
+
+use anyhow::Context;
+use arc_swap::ArcSwapOption;
+
+use crate::{
+    compute::ConnCfg,
+    console::{
+        messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
+        NodeInfo,
+    },
+    intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
+    RoleName,
+};
+
+use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
+
+pub struct LocalBackend {
+    pub jwks_cache: JwkCache,
+    pub postgres_addr: SocketAddr,
+    pub node_info: NodeInfo,
+}
+
+impl LocalBackend {
+    pub fn new(postgres_addr: SocketAddr) -> Self {
+        LocalBackend {
+            jwks_cache: JwkCache::default(),
+            postgres_addr,
+            node_info: NodeInfo {
+                config: {
+                    let mut cfg = ConnCfg::new();
+                    cfg.host(&postgres_addr.ip().to_string());
+                    cfg.port(postgres_addr.port());
+                    cfg
+                },
+                // TODO(conrad): make this better reflect compute info rather than endpoint info.
+                aux: MetricsAuxInfo {
+                    endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
+                    project_id: ProjectIdTag::get_interner().get_or_intern("local"),
+                    branch_id: BranchIdTag::get_interner().get_or_intern("local"),
+                    cold_start_info: ColdStartInfo::WarmCached,
+                },
+                allow_self_signed_compute: false,
+            },
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+pub struct StaticAuthRules;
+
+pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
+
+#[derive(Debug, Clone)]
+pub struct JwksRoleSettings {
+    pub roles: HashMap<RoleName, EndpointJwksResponse>,
+    pub project_id: ProjectIdInt,
+    pub branch_id: BranchIdInt,
+}
+
+impl FetchAuthRules for StaticAuthRules {
+    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
+        let mappings = JWKS_ROLE_MAP.load();
+        let role_mappings = mappings
+            .as_deref()
+            .and_then(|m| m.roles.get(&role_name))
+            .context("JWKs settings for this role were not configured")?;
+        let mut rules = vec![];
+        for setting in &role_mappings.jwks {
+            rules.push(AuthRule {
+                id: setting.id.clone(),
+                jwks_url: setting.jwks_url.clone(),
+                audience: setting.jwt_audience.clone(),
+            });
+        }
+
+        Ok(rules)
+    }
+}
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index ac66e116d0..a7ccf076b0 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,11 +1,13 @@
 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 use std::fmt::{self, Display};
 
 use crate::auth::IpPattern;
 
 use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 use crate::proxy::retry::CouldRetry;
+use crate::RoleName;
 
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
@@ -341,6 +343,26 @@ impl ColdStartInfo {
     }
 }
 
+#[derive(Debug, Deserialize, Clone)]
+pub struct JwksRoleMapping {
+    pub roles: HashMap<RoleName, EndpointJwksResponse>,
+}
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct EndpointJwksResponse {
+    pub jwks: Vec<JwksSettings>,
+}
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct JwksSettings {
+    pub id: String,
+    pub project_id: ProjectIdInt,
+    pub branch_id: BranchIdInt,
+    pub jwks_url: url::Url,
+    pub provider_name: String,
+    pub jwt_audience: Option<String>,
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index cc2ee10062..4794527410 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -305,6 +305,7 @@ impl NodeInfo {
         match keys {
             ComputeCredentialKeys::Password(password) => self.config.password(password),
             ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
+            ComputeCredentialKeys::None => &mut self.config,
         };
     }
 }
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index f38e43ba5a..e1a54a9c98 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -61,7 +61,7 @@ pub trait ComputeConnectBackend {
         ctx: &RequestMonitoring,
     ) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
 
-    fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
+    fn get_keys(&self) -> &ComputeCredentialKeys;
 }
 
 pub struct TcpMechanism<'a> {
@@ -112,9 +112,8 @@ where
     let mut num_retries = 0;
     let mut node_info =
         wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
-    if let Some(keys) = user_info.get_keys() {
-        node_info.set_keys(keys);
-    }
+
+    node_info.set_keys(user_info.get_keys());
     node_info.allow_self_signed_compute = allow_self_signed_compute;
     // let mut node_info = credentials.get_node_info(ctx, user_info).await?;
     mechanism.update_connect_config(&mut node_info.config);
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index b2bf93dc6d..ea65867293 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -407,7 +407,7 @@ async fn request_handler(
             .header("Access-Control-Allow-Origin", "*")
             .header(
                 "Access-Control-Allow-Headers",
-                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
+                "Authorization, Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
             )
             .header("Access-Control-Max-Age", "86400" /* 24 hours */)
             .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 295ea1a1c7..b44ecb76e3 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -4,7 +4,10 @@ use async_trait::async_trait;
 use tracing::{field::display, info};
 
 use crate::{
-    auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
+    auth::{
+        backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo},
+        check_peer_addr_is_in_list, AuthError,
+    },
     compute,
     config::{AuthenticationConfig, ProxyConfig},
     console::{
@@ -24,7 +27,7 @@ use crate::{
     Host,
 };
 
-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
+use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool};
 
 pub struct PoolingBackend {
     pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
@@ -33,13 +36,14 @@ pub struct PoolingBackend {
 }
 
 impl PoolingBackend {
-    pub async fn authenticate(
+    pub async fn authenticate_with_password(
         &self,
         ctx: &RequestMonitoring,
         config: &AuthenticationConfig,
-        conn_info: &ConnInfo,
+        user_info: &ComputeUserInfo,
+        password: &[u8],
     ) -> Result<ComputeCredentials, AuthError> {
-        let user_info = conn_info.user_info.clone();
+        let user_info = user_info.clone();
         let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
         if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
@@ -47,7 +51,7 @@ impl PoolingBackend {
         }
         if !self
             .endpoint_rate_limiter
-            .check(conn_info.user_info.endpoint.clone().into(), 1)
+            .check(user_info.endpoint.clone().into(), 1)
         {
             return Err(AuthError::too_many_connections());
         }
@@ -70,14 +74,10 @@ impl PoolingBackend {
                 return Err(AuthError::auth_failed(&*user_info.user));
             }
         };
-        let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
-        let auth_outcome = crate::auth::validate_password_and_exchange(
-            &config.thread_pool,
-            ep,
-            &conn_info.password,
-            secret,
-        )
-        .await?;
+        let ep = EndpointIdInt::from(&user_info.endpoint);
+        let auth_outcome =
+            crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret)
+                .await?;
         let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => {
                 info!("user successfully authenticated");
@@ -85,7 +85,7 @@ impl PoolingBackend {
             }
             crate::sasl::Outcome::Failure(reason) => {
                 info!("auth backend failed with an error: {reason}");
-                Err(AuthError::auth_failed(&*conn_info.user_info.user))
+                Err(AuthError::auth_failed(&*user_info.user))
             }
         };
         res.map(|key| ComputeCredentials {
@@ -94,6 +94,39 @@ impl PoolingBackend {
         })
     }
 
+    pub async fn authenticate_with_jwt(
+        &self,
+        ctx: &RequestMonitoring,
+        user_info: &ComputeUserInfo,
+        jwt: &str,
+    ) -> Result<ComputeCredentials, AuthError> {
+        match &self.config.auth_backend {
+            crate::auth::BackendType::Console(_, _) => {
+                Err(AuthError::auth_failed("JWT login is not yet supported"))
+            }
+            crate::auth::BackendType::Link(_, _) => Err(AuthError::auth_failed(
+                "JWT login over link proxy is not supported",
+            )),
+            crate::auth::BackendType::Local(cache) => {
+                cache
+                    .jwks_cache
+                    .check_jwt(
+                        ctx,
+                        user_info.endpoint.clone(),
+                        user_info.user.clone(),
+                        &StaticAuthRules,
+                        jwt,
+                    )
+                    .await
+                    .map_err(|e| AuthError::auth_failed(e.to_string()))?;
+                Ok(ComputeCredentials {
+                    info: user_info.clone(),
+                    keys: crate::auth::backend::ComputeCredentialKeys::None,
+                })
+            }
+        }
+    }
+
     // Wake up the destination if needed. Code here is a bit involved because
     // we reuse the code from the usual proxy and we need to prepare few structures
     // that this code expects.
@@ -232,10 +265,16 @@ impl ConnectMechanism for TokioMechanism {
         let mut config = (*node_info.config).clone();
         let config = config
             .user(&self.conn_info.user_info.user)
-            .password(&*self.conn_info.password)
             .dbname(&self.conn_info.dbname)
             .connect_timeout(timeout);
 
+        match &self.conn_info.auth {
+            AuthData::Jwt(_) => {}
+            AuthData::Password(pw) => {
+                config.password(pw);
+            }
+        }
+
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let res = config.connect(tokio_postgres::NoTls).await;
         drop(pause);
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 3478787995..6ed694af58 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -33,7 +33,13 @@ use super::backend::HttpConnError;
 pub struct ConnInfo {
     pub user_info: ComputeUserInfo,
     pub dbname: DbName,
-    pub password: SmallVec<[u8; 16]>,
+    pub auth: AuthData,
+}
+
+#[derive(Debug, Clone)]
+pub enum AuthData {
+    Password(SmallVec<[u8; 16]>),
+    Jwt(String),
 }
 
 impl ConnInfo {
@@ -778,7 +784,7 @@ mod tests {
                 options: Default::default(),
             },
             dbname: "dbname".into(),
-            password: "password".as_bytes().into(),
+            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
@@ -836,7 +842,7 @@ mod tests {
                 options: Default::default(),
             },
             dbname: "dbname".into(),
-            password: "password".as_bytes().into(),
+            auth: AuthData::Password("password".as_bytes().into()),
         };
         let ep_pool = Arc::downgrade(
             &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()),
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index bbfed90f39..79baef45f6 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -7,6 +7,7 @@ use futures::future::try_join;
 use futures::future::Either;
 use futures::StreamExt;
 use futures::TryFutureExt;
+use http::header::AUTHORIZATION;
 use http_body_util::BodyExt;
 use http_body_util::Full;
 use hyper1::body::Body;
@@ -56,6 +57,7 @@ use crate::DbName;
 use crate::RoleName;
 
 use super::backend::PoolingBackend;
+use super::conn_pool::AuthData;
 use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
 use super::http_util::json_response;
@@ -88,6 +90,7 @@ enum Payload {
 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
 
+static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string");
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
@@ -109,7 +112,7 @@ where
 #[derive(Debug, thiserror::Error)]
 pub enum ConnInfoError {
     #[error("invalid header: {0}")]
-    InvalidHeader(&'static str),
+    InvalidHeader(&'static HeaderName),
     #[error("invalid connection string: {0}")]
     UrlParseError(#[from] url::ParseError),
     #[error("incorrect scheme")]
@@ -153,10 +156,10 @@ fn get_conn_info(
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
     let connection_string = headers
-        .get("Neon-Connection-String")
-        .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))?
+        .get(&CONN_STRING)
+        .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))?
         .to_str()
-        .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?;
+        .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?;
 
     let connection_url = Url::parse(connection_string)?;
 
@@ -179,10 +182,23 @@ fn get_conn_info(
     }
     ctx.set_user(username.clone());
 
-    let password = connection_url
-        .password()
-        .ok_or(ConnInfoError::MissingPassword)?;
-    let password = urlencoding::decode_binary(password.as_bytes());
+    let auth = if let Some(auth) = headers.get(&AUTHORIZATION) {
+        let auth = auth
+            .to_str()
+            .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?;
+        AuthData::Jwt(
+            auth.strip_prefix("Bearer ")
+                .ok_or(ConnInfoError::MissingPassword)?
+                .into(),
+        )
+    } else if let Some(pass) = connection_url.password() {
+        AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) {
+            std::borrow::Cow::Borrowed(b) => b.into(),
+            std::borrow::Cow::Owned(b) => b.into(),
+        })
+    } else {
+        return Err(ConnInfoError::MissingPassword);
+    };
 
     let endpoint = match connection_url.host() {
         Some(url::Host::Domain(hostname)) => {
@@ -225,10 +241,7 @@ fn get_conn_info(
     Ok(ConnInfo {
         user_info,
         dbname,
-        password: match password {
-            std::borrow::Cow::Borrowed(b) => b.into(),
-            std::borrow::Cow::Owned(b) => b.into(),
-        },
+        auth,
     })
 }
 
@@ -550,9 +563,24 @@ async fn handle_inner(
 
     let authenticate_and_connect = Box::pin(
         async {
-            let keys = backend
-                .authenticate(ctx, &config.authentication_config, &conn_info)
-                .await?;
+            let keys = match &conn_info.auth {
+                AuthData::Password(pw) => {
+                    backend
+                        .authenticate_with_password(
+                            ctx,
+                            &config.authentication_config,
+                            &conn_info.user_info,
+                            pw,
+                        )
+                        .await?
+                }
+                AuthData::Jwt(jwt) => {
+                    backend
+                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                        .await?
+                }
+            };
+
             let client = backend
                 .connect_to_compute(ctx, conn_info, keys, !allow_pool)
                 .await?;

From 06795c6b9a6b4664dadd4c75ccf9f75087b05614 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 23 Aug 2024 22:32:10 +0100
Subject: [PATCH 54/55] proxy: new local-proxy application (#8736)

Add a binary for local-proxy that uses the local auth backend. It runs only
the HTTP serverless driver and supports reloading its config file on
SIGHUP.
---
 proxy/src/bin/local_proxy.rs   | 316 +++++++++++++++++++++++++++++++++
 proxy/src/bin/pg_sni_router.rs |   4 +-
 proxy/src/bin/proxy.rs         |   7 +-
 proxy/src/lib.rs               |  14 +-
 4 files changed, 335 insertions(+), 6 deletions(-)
 create mode 100644 proxy/src/bin/local_proxy.rs

diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
new file mode 100644
index 0000000000..8acba33bac
--- /dev/null
+++ b/proxy/src/bin/local_proxy.rs
@@ -0,0 +1,316 @@
+use std::{
+    net::SocketAddr,
+    path::{Path, PathBuf},
+    pin::pin,
+    sync::Arc,
+    time::Duration,
+};
+
+use anyhow::{bail, ensure};
+use dashmap::DashMap;
+use futures::{future::Either, FutureExt};
+use proxy::{
+    auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
+    cancellation::CancellationHandlerMain,
+    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
+    console::{locks::ApiLocks, messages::JwksRoleMapping},
+    http::health_server::AppMetrics,
+    metrics::{Metrics, ThreadPoolMetrics},
+    rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
+    scram::threadpool::ThreadPool,
+    serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
+};
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
+use clap::Parser;
+use tokio::{net::TcpListener, task::JoinSet};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, warn};
+use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
+
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+/// Neon proxy/router
+#[derive(Parser)]
+#[command(version = GIT_VERSION, about)]
+struct LocalProxyCliArgs {
+    /// listen for incoming metrics connections on ip:port
+    #[clap(long, default_value = "127.0.0.1:7001")]
+    metrics: String,
+    /// listen for incoming http connections on ip:port
+    #[clap(long)]
+    http: String,
+    /// timeout for the TLS handshake
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    handshake_timeout: tokio::time::Duration,
+    /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
+    #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
+    connect_compute_lock: String,
+    #[clap(flatten)]
+    sql_over_http: SqlOverHttpArgs,
+    /// User rate limiter max number of requests per second.
+    ///
+    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
+    /// Can be given multiple times for different bucket sizes.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    user_rps_limit: Vec<RateBucketInfo>,
+    /// Whether the auth rate limiter actually takes effect (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    auth_rate_limit_enabled: bool,
+    /// Authentication rate limiter max number of hashes per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
+    auth_rate_limit: Vec<RateBucketInfo>,
+    /// The IP subnet to use when considering whether two IP addresses are considered the same.
+    #[clap(long, default_value_t = 64)]
+    auth_rate_limit_ip_subnet: u8,
+    /// Whether to retry the connection to the compute node
+    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
+    connect_to_compute_retry: String,
+    /// Address of the postgres server
+    #[clap(long, default_value = "127.0.0.1:5432")]
+    compute: SocketAddr,
+    /// File address of the local proxy config file
+    #[clap(long, default_value = "./localproxy.json")]
+    config_path: PathBuf,
+}
+
+#[derive(clap::Args, Clone, Copy, Debug)]
+struct SqlOverHttpArgs {
+    /// How many connections to pool for each endpoint. Excess connections are discarded
+    #[clap(long, default_value_t = 200)]
+    sql_over_http_pool_max_total_conns: usize,
+
+    /// How long pooled connections should remain idle for before closing
+    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
+    sql_over_http_idle_timeout: tokio::time::Duration,
+
+    #[clap(long, default_value_t = 100)]
+    sql_over_http_client_conn_threshold: u64,
+
+    #[clap(long, default_value_t = 16)]
+    sql_over_http_cancel_set_shards: usize,
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let _logging_guard = proxy::logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
+
+    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
+
+    info!("Version: {GIT_VERSION}");
+    info!("Build_tag: {BUILD_TAG}");
+    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
+        revision: GIT_VERSION,
+        build_tag: BUILD_TAG,
+    });
+
+    let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
+        Ok(t) => Some(t),
+        Err(e) => {
+            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
+            None
+        }
+    };
+
+    let args = LocalProxyCliArgs::parse();
+    let config = build_config(&args)?;
+
+    let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
+    let http_listener = TcpListener::bind(args.http).await?;
+    let shutdown = CancellationToken::new();
+
+    // todo: should scale with CU
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
+        LeakyBucketConfig {
+            rps: 10.0,
+            max: 100.0,
+        },
+        16,
+    ));
+
+    refresh_config(args.config_path.clone()).await;
+
+    let mut maintenance_tasks = JoinSet::new();
+    maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
+        refresh_config(args.config_path.clone()).map(Ok)
+    }));
+    maintenance_tasks.spawn(proxy::http::health_server::task_main(
+        metrics_listener,
+        AppMetrics {
+            jemalloc,
+            neon_metrics,
+            proxy: proxy::metrics::Metrics::get(),
+        },
+    ));
+
+    let task = serverless::task_main(
+        config,
+        http_listener,
+        shutdown.clone(),
+        Arc::new(CancellationHandlerMain::new(
+            Arc::new(DashMap::new()),
+            None,
+            proxy::metrics::CancellationSource::Local,
+        )),
+        endpoint_rate_limiter,
+    );
+
+    match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
+        // exit immediately on maintenance task completion
+        Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
+        // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
+        Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
+        // exit immediately on client task error
+        Either::Right((res, _)) => res?,
+    }
+
+    Ok(())
+}
+
+/// Builds the process-wide [`ProxyConfig`] from the CLI args; it is leaked to obtain `'static`.
+fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
+    let config::ConcurrencyLockOptions {
+        shards,
+        limiter,
+        epoch,
+        timeout,
+    } = args.connect_compute_lock.parse()?;
+    info!(
+        ?limiter,
+        shards,
+        ?epoch,
+        "Using NodeLocks (connect_compute)"
+    );
+    let connect_compute_locks = ApiLocks::new(
+        "connect_compute_lock",
+        limiter,
+        shards,
+        timeout,
+        epoch,
+        &Metrics::get().proxy.connect_compute_lock,
+    )?;
+
+    let http_config = HttpConfig {
+        accept_websockets: false,
+        pool_options: GlobalConnPoolOptions {
+            gc_epoch: Duration::from_secs(60),
+            pool_shards: 2,
+            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
+            opt_in: false,
+
+            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
+            max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
+        },
+        cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
+        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
+    };
+
+    Ok(Box::leak(Box::new(ProxyConfig {
+        tls_config: None,
+        auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned(
+            LocalBackend::new(args.compute),
+        )),
+        metric_collection: None,
+        allow_self_signed_compute: false,
+        http_config,
+        authentication_config: AuthenticationConfig {
+            thread_pool: ThreadPool::new(0),
+            scram_protocol_timeout: Duration::from_secs(10),
+            rate_limiter_enabled: false,
+            rate_limiter: BucketRateLimiter::new(vec![]),
+            rate_limit_ip_subnet: 64,
+        },
+        require_client_ip: false,
+        handshake_timeout: Duration::from_secs(10),
+        region: "local".into(),
+        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
+        connect_compute_locks,
+        // Parse the `--connect-to-compute-retry` flag; when it is not given, clap supplies
+        // `RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES` as its default.
+        connect_to_compute_retry_config: RetryConfig::parse(&args.connect_to_compute_retry)?,
+    })))
+}
+
+/// Reloads the JWKS role settings from the file at `path`.
+///
+/// A failure in `refresh_config_inner` is logged and then dropped; nothing is returned.
+async fn refresh_config(path: PathBuf) {
+    if let Err(e) = refresh_config_inner(&path).await {
+        error!(error=?e, ?path, "could not read config file");
+    }
+}
+
+/// Loads `path`, checks and normalises each JWKS URL, then updates `JWKS_ROLE_MAP` if any exist.
+async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
+    let bytes = tokio::fs::read(&path).await?;
+    let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
+    let mut settings = None;
+
+    for mapping in data.roles.values_mut() {
+        for jwks in &mut mapping.jwks {
+            ensure!(
+                jwks.jwks_url.has_authority()
+                    && (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
+                "Invalid JWKS url. Must be HTTP",
+            );
+
+            ensure!(
+                jwks.jwks_url
+                    .host()
+                    .is_some_and(|h| h != url::Host::Domain("")),
+                "Invalid JWKS url. No domain listed",
+            );
+
+            // clear username, password and ports
+            jwks.jwks_url.set_username("").expect(
+                "url can be a base and has a valid host and is not a file. should not error",
+            );
+            jwks.jwks_url.set_password(None).expect(
+                "url can be a base and has a valid host and is not a file. should not error",
+            );
+            // local testing is hard if we need to have a specific restricted port
+            if cfg!(not(feature = "testing")) {
+                jwks.jwks_url.set_port(None).expect(
+                    "url can be a base and has a valid host and is not a file. should not error",
+                );
+            }
+
+            // clear the fragment and the query; `set_query(None)` also drops the `?` separator
+            jwks.jwks_url.set_fragment(None);
+            jwks.jwks_url.set_query(None);
+
+            if jwks.jwks_url.scheme() != "https" {
+                // local testing is hard if we need to set up https support.
+                if cfg!(not(feature = "testing")) {
+                    jwks.jwks_url
+                        .set_scheme("https")
+                        .expect("should not error to set the scheme to https if it was http");
+                } else {
+                    warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
+                }
+            }
+
+            let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
+            ensure!(
+                *pr == jwks.project_id,
+                "inconsistent project IDs configured"
+            );
+            ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
+        }
+    }
+
+    if let Some((project_id, branch_id)) = settings {
+        JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
+            roles: data.roles,
+            project_id,
+            branch_id,
+        })));
+    }
+
+    Ok(())
+}
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index 1038fa5116..20d2d3df9a 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
     ));
-    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));
+    let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
+        Ok(())
+    }));
 
     // the signal task cant ever succeed.
     // the main task can error, or can succeed on cancellation.
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index d83a1f3bcf..1f45a33ed5 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -148,7 +148,7 @@ struct ProxyCliArgs {
     disable_dynamic_rate_limiter: bool,
     /// Endpoint rate limiter max number of requests per second.
     ///
-    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
+    /// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
     /// Can be given multiple times for different bucket sizes.
     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
     endpoint_rps_limit: Vec<RateBucketInfo>,
@@ -447,7 +447,10 @@ async fn main() -> anyhow::Result<()> {
 
     // maintenance tasks. these never return unless there's an error
     let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
+    maintenance_tasks.spawn(proxy::handle_signals(
+        cancellation_token.clone(),
+        || async { Ok(()) },
+    ));
     maintenance_tasks.spawn(http::health_server::task_main(
         http_listener,
         AppMetrics {
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index b7d497ebcc..8e1a4e4fa2 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -88,7 +88,7 @@
 // List of temporarily allowed lints to unblock beta/nightly.
 #![allow(unknown_lints, clippy::manual_inspect)]
 
-use std::convert::Infallible;
+use std::{convert::Infallible, future::Future};
 
 use anyhow::{bail, Context};
 use intern::{EndpointIdInt, EndpointIdTag, InternId};
@@ -123,7 +123,14 @@ pub mod usage_metrics;
 pub mod waiters;
 
 /// Handle unix signals appropriately.
-pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {
+pub async fn handle_signals<F, Fut>(
+    token: CancellationToken,
+    mut refresh_config: F,
+) -> anyhow::Result<Infallible>
+where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = anyhow::Result<()>>,
+{
     use tokio::signal::unix::{signal, SignalKind};
 
     let mut hangup = signal(SignalKind::hangup())?;
@@ -134,7 +141,8 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
         tokio::select! {
             // Hangup is commonly used for config reload.
             _ = hangup.recv() => {
-                warn!("received SIGHUP; config reload is not supported");
+                warn!("received SIGHUP");
+                refresh_config().await?;
             }
             // Shut down the whole application.
             _ = interrupt.recv() => {

From cdfdcd3e5d665c6f8093623e7323cef3d58aa308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?=
 <31549762+mrl5@users.noreply.github.com>
Date: Sun, 25 Aug 2024 17:33:45 +0200
Subject: [PATCH 55/55] chore: improve markdown formatting (#8825)

fixes:

![Screenshot_2024-08-25_16-25-30](https://github.com/user-attachments/assets/c993309b-6c2d-4938-9fd0-ce0953fc63ff)

fixes:

![Screenshot_2024-08-25_16-26-29](https://github.com/user-attachments/assets/cf497f4a-d9e3-45a6-a1a5-7e215d96d022)
---
 proxy/README.md            | 5 +++--
 storage_scrubber/README.md | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/proxy/README.md b/proxy/README.md
index d1f2e3f27b..afc8b77db8 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation
 ```
 
 If both postgres and proxy are running you may send a SQL query:
-```json
+```console
 curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
   -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
   -H 'Content-Type: application/json' \
@@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
     "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
     "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
   }' | jq
-
+```
+```json
 {
   "command": "SELECT",
   "fields": [
diff --git a/storage_scrubber/README.md b/storage_scrubber/README.md
index 9fbd92feef..5be8541419 100644
--- a/storage_scrubber/README.md
+++ b/storage_scrubber/README.md
@@ -98,7 +98,7 @@ to list timelines and find their backup and start LSNs.
 
 If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
 
-First, we need to group pageservers by buckets, `https://<admin host>/admin/pageservers`` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed.
+First, we need to group pageservers by buckets, `https://<admin host>/admin/pageservers` can be used for all env nodes, then `cat /storage/pageserver/data/pageserver.toml` on every node will show the bucket names and regions needed.
 
 Per bucket, for every pageserver id related, find deleted tenants: