From 131585eb6bd206907a969f8eab44017b282d1556 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 21:07:44 +0000 Subject: [PATCH 01/26] chore: update rust-postgres (#10002) Like #9931 but without rebasing upstream just yet, to try and minimise the differences. Removes all proxy-specific commits from the rust-postgres fork, now that proxy no longer depends on them. Merging upstream changes to come later. --- Cargo.lock | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de8785f87e..62f06d45bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4169,7 +4169,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -4182,7 +4182,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "base64 0.20.0", "byteorder", @@ -4195,7 +4195,6 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", - "tokio", ] [[package]] @@ -4217,7 +4216,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -6547,7 +6546,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "async-trait", "byteorder", From ed2d89211306ca892dce41159bc1cc8e9e1646a5 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 21:16:09 -0500 Subject: [PATCH 02/26] pageserver: fix buffered-writer on macos build (#10019) ## Problem In https://github.com/neondatabase/neon/pull/9693, we forgot to check macos build. The [CI run](https://github.com/neondatabase/neon/actions/runs/12164541897/job/33926455468) on main showed that macos build failed with unused variables and dead code. ## Summary of changes - add `allow(dead_code)` and `allow(unused_variables)` to the relevant code that is not used on macos. 
Signed-off-by: Yuchen Liang --- pageserver/src/tenant/remote_timeline_client/download.rs | 7 ++++--- pageserver/src/virtual_file/owned_buffers_io/write.rs | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c5ae466f3a..d15f161fb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,7 +6,6 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; -use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -27,7 +26,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -150,7 +149,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, - gate: &utils::sync::gate::Gate, + #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -209,6 +208,8 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io; + use crate::virtual_file::IoBufferMut; + use std::sync::Arc; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 20bf878123..7299d83703 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -132,6 +132,7 @@ where .expect("must not use after we returned an error") } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered_borrowed( &mut self, chunk: &[u8], From ffc9c33eb2383f9970a246bce8712772c7696080 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 5 Dec 2024 07:30:38 +0200 Subject: [PATCH 03/26] proxy: Present new auth backend cplane_proxy_v1 (#10012) Implement a new auth backend based on the current Neon backend to switch to the new Proxy V1 cplane API. 
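A rough usage sketch for selecting the new backend, assuming the clap-derived flag names are `--auth-backend` and `--auth-endpoint` (only the `cplane-v1` value name is taken from the code in this patch; exact flags may differ):

```
# hypothetical invocation; flag names are assumptions, value name comes from the clap attribute
proxy --auth-backend cplane-v1 --auth-endpoint '<control-plane-api-url>'
```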
Implements [#21048](https://github.com/neondatabase/cloud/issues/21048) --- proxy/src/auth/backend/mod.rs | 4 + proxy/src/bin/proxy.rs | 99 +++- .../control_plane/client/cplane_proxy_v1.rs | 514 ++++++++++++++++++ proxy/src/control_plane/client/mod.rs | 7 + proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/control_plane/messages.rs | 10 + 6 files changed, 634 insertions(+), 2 deletions(-) create mode 100644 proxy/src/control_plane/client/cplane_proxy_v1.rs diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 84a572dcf9..1bad7b3086 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { + ControlPlaneClient::ProxyV1(endpoint) => fmt + .debug_tuple("ControlPlane::ProxyV1") + .field(&endpoint.url()) + .finish(), ControlPlaneClient::Neon(endpoint) => fmt .debug_tuple("ControlPlane::Neon") .field(&endpoint.url()) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c929b97d78..99144acef0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -46,6 +46,9 @@ enum AuthBackendType { #[value(name("console"), alias("cplane"))] ControlPlane, + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + #[value(name("link"), alias("control-redirect"))] ConsoleRedirect, @@ -518,6 +521,39 @@ async fn main() -> anyhow::Result<()> { .instrument(span), ); } + } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } } } @@ -662,6 +698,65 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = 
args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); + + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -697,13 +792,15 @@ fn build_auth_backend( )?)); tokio::spawn(locks.garbage_collect_worker()); - let url = args.auth_endpoint.parse()?; + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, args.control_plane_token.clone(), diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs new file mode 100644 index 0000000000..e33a37f643 --- /dev/null +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -0,0 +1,514 @@ +//! Production console backend. 
+ +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; +use futures::TryFutureExt; +use postgres_client::config::SslMode; +use tokio::time::Instant; +use tracing::{debug, info, info_span, warn, Instrument}; + +use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestContext; +use crate::control_plane::caches::ApiCaches; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::{ + AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, +}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; + +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] +pub struct NeonControlPlaneClient { + endpoint: http::Endpoint, + pub caches: &'static ApiCaches, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, +} + +impl NeonControlPlaneClient { + /// Construct an API object containing the auth parameters. + pub fn new( + endpoint: http::Endpoint, + jwt: Arc, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, + ) -> Self { + Self { + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + jwt, + } + } + + pub(crate) fn url(&self) -> &str { + self.endpoint.url().as_str() + } + + async fn do_get_auth_info( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let request = self + .endpoint + .get_path("get_endpoint_access_control") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), + ]) + .build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + // TODO(anna): retry + Err(e) => { + return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. 
+ Ok(AuthInfo::default()) + } else { + Err(e.into()) + }; + } + }; + + // Ivan: don't know where it will be used, so I leave it here + let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); + Ok(AuthInfo { + secret, + allowed_ips, + project_id: body.project_id, + }) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) + .await + } + + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + { + return Err(GetEndpointJwksError::EndpointNotFound); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) + .await + } + + async fn do_wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let mut request_builder = self + .endpoint + .get_path("wake_compute") + .header("X-Request-ID", &request_id) + .header("Authorization", format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ]); + + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + + let request = request_builder.build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = parse_body::(response).await?; + + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; + + // Don't set anything but host and port! 
This config will be cached. + // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + + let node = NodeInfo { + config, + aux: body.aux, + allow_self_signed_compute: false, + }; + + Ok(node) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) + .await + } +} + +impl super::ControlPlaneApi for NeonControlPlaneClient { + #[tracing::instrument(skip_all)] + async fn get_role_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + let user = &user_info.user; + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_ips), + ); + ctx.set_project_id(project_id); + } + // When we just got a secret, we don't need to invalidate it. + Ok(Cached::new_uncached(auth_info.secret)) + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); + } + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let user = &user_info.user; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) + } + + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + + #[tracing::instrument(skip_all)] + async fn wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let key = user_info.endpoint_cache_key(); + + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + + // Every time we do a wakeup http request, the compute node will stay up + // for some time (highly depends on the console's scale-to-zero policy); + // The connection info remains the same during that period of time, + // which means that we might cache it to reduce the load and latency. + check_cache!(); + + let permit = self.locks.get_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. + check_cache!(); + } + + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); + + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; + + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); + + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(err.clone()), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))) + } + err => return Err(err), + }, + } + } +} + +/// Parse http response body, taking status code into account. +async fn parse_body serde::Deserialize<'a>>( + response: http::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. + info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. 
+ let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ControlPlaneErrorMessage { + error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, + } + }); + body.http_status_code = status; + + warn!("console responded with an error ({status}): {body:?}"); + Err(ControlPlaneError::Message(Box::new(body))) +} + +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port_v4() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index f8f74372f0..7ef5a9c9fd 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,3 +1,4 @@ +pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; @@ -27,6 +28,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { + /// New Proxy V1 control plane API + ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), /// Current Management API (V2). Neon(neon::NeonControlPlaneClient), /// Local mock control plane. 
@@ -45,6 +48,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, @@ -61,6 +65,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { + Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, @@ -75,6 +80,7 @@ impl ControlPlaneApi for ControlPlaneClient { endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { + Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, @@ -89,6 +95,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5c204ae1d7..bf62c0d6ab 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -1,4 +1,4 @@ -//! Production console backend. +//! Stale console backend, remove after migrating to Proxy V1 API (#15245). use std::sync::Arc; use std::time::Duration; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8762ba874b..2662ab85f9 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -230,6 +230,16 @@ pub(crate) struct GetRoleSecret { pub(crate) project_id: Option, } +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/get_endpoint_access_control` API method. +#[derive(Deserialize)] +pub(crate) struct GetEndpointAccessControl { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, + pub(crate) allowed_vpc_endpoint_ids: Option>, +} + // Manually implement debug to omit sensitive info. impl fmt::Debug for GetRoleSecret { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { From db793044167d1b81cea7f2a2a57a189711d0d683 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 5 Dec 2024 18:29:21 +0100 Subject: [PATCH 04/26] storage_controller: increase shard scan timeout (#10000) ## Problem The node shard scan timeout of 1 second is a bit too aggressive, and we've seen this cause test failures. The scans are performed in parallel across nodes, and the entire operation has a 15 second timeout. Resolves #9801. ## Summary of changes Increase the timeout to 5 seconds. This is still enough to time out on a network failure and retry successfully within 15 seconds. 
--- storage_controller/src/service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 92ec58cb4d..083c78233a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -789,7 +789,7 @@ impl Service { node_list_futs.push({ async move { tracing::info!("Scanning shards on node {node}..."); - let timeout = Duration::from_secs(1); + let timeout = Duration::from_secs(5); let response = node .with_client_retries( |client| async move { client.list_location_config().await }, From 13e810574029953ab4f5002724ad853fc2c39922 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 5 Dec 2024 18:57:25 +0100 Subject: [PATCH 05/26] feat(compute): Allow specifying the reconfiguration concurrency (#10006) ## Problem We need a higher concurrency during reconfiguration in case of many DBs, but the instance is already running and used by the client. We can easily get out of `max_connections` limit, and the current code won't handle that. ## Summary of changes Default to 1, but also allow control plane to override this value for specific projects. It's also recommended to bump `superuser_reserved_connections` += `reconfigure_concurrency` for such projects to ensure that we always have enough spare connections for reconfiguration process to succeed. Quick workaround for neondatabase/cloud#17846 --- compute_tools/src/compute.rs | 7 +------ control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 25 ++++++++++++++++++++++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0d1e6d680f..d72a04f2f9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1243,12 +1243,7 @@ impl ComputeNode { let postgresql_conf_path = pgdata_path.join("postgresql.conf"); config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; - // TODO(ololobus): We need a concurrency during reconfiguration as well, - // but DB is already running and used by user. We can easily get out of - // `max_connections` limit, and the current code won't handle that. - // let compute_state = self.state.lock().unwrap().clone(); - // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec); - let max_concurrent_connections = 1; + let max_concurrent_connections = spec.reconfigure_concurrency; // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1ca6dc43c4..360857f365 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -618,6 +618,7 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, + reconfigure_concurrency: 1, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 8a447563dc..6d9c353cda 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -19,6 +19,10 @@ pub type PgIdent = String; /// String type alias representing Postgres extension version pub type ExtVersion = String; +fn default_reconfigure_concurrency() -> usize { + 1 +} + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. 
#[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -67,7 +71,7 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, - /// An optinal hint that can be passed to speed up startup time if we know + /// An optional hint that can be passed to speed up startup time if we know /// that no pg catalog mutations (like role creation, database creation, /// extension creation) need to be done on the actual database to start. #[serde(default)] // Default false @@ -86,9 +90,7 @@ pub struct ComputeSpec { // etc. GUCs in cluster.settings. TODO: Once the control plane has been // updated to fill these fields, we can make these non optional. pub tenant_id: Option, - pub timeline_id: Option, - pub pageserver_connstring: Option, #[serde(default)] @@ -113,6 +115,20 @@ pub struct ComputeSpec { /// Local Proxy configuration used for JWT authentication #[serde(default)] pub local_proxy_config: Option, + + /// Number of concurrent connections during the parallel RunInEachDatabase + /// phase of the apply config process. + /// + /// We need a higher concurrency during reconfiguration in case of many DBs, + /// but instance is already running and used by client. We can easily get out of + /// `max_connections` limit, and the current code won't handle that. + /// + /// Default is 1, but also allow control plane to override this value for specific + /// projects. It's also recommended to bump `superuser_reserved_connections` += + /// `reconfigure_concurrency` for such projects to ensure that we always have + /// enough spare connections for reconfiguration process to succeed. + #[serde(default = "default_reconfigure_concurrency")] + pub reconfigure_concurrency: usize, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -315,6 +331,9 @@ mod tests { // Features list defaults to empty vector. assert!(spec.features.is_empty()); + + // Reconfigure concurrency defaults to 1. + assert_eq!(spec.reconfigure_concurrency, 1); } #[test] From c0ba4169676300c72ec3b567996c2604be93b136 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 5 Dec 2024 13:04:33 -0600 Subject: [PATCH 06/26] Add compute_logical_snapshots_bytes metric (#9887) This metric exposes the size of all non-temporary logical snapshot files. 
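The gauge is labelled by `timeline_id` (see the collector definition below); once scraped, the exposition would look roughly like the following, where the value and timeline id are placeholders:

```
# TYPE compute_logical_snapshots_bytes gauge
compute_logical_snapshots_bytes{timeline_id="0123456789abcdef0123456789abcdef"} 16384
```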
Signed-off-by: Tristan Partin --- compute/etc/neon_collector.jsonnet | 1 + .../compute_logical_snapshots_bytes.15.sql | 7 +++++++ .../compute_logical_snapshots_bytes.libsonnet | 17 +++++++++++++++++ .../compute_logical_snapshots_bytes.sql | 9 +++++++++ 4 files changed, 34 insertions(+) create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 75d69c7b68..aa6cc1cfc8 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -6,6 +6,7 @@ import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', import 'sql_exporter/compute_max_connections.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', import 'sql_exporter/compute_subscriptions_count.libsonnet', diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql new file mode 100644 index 0000000000..73a9c11405 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT current_setting('neon.timeline_id')) AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet new file mode 100644 index 0000000000..8e1792d386 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet @@ -0,0 +1,17 @@ +local neon = import 'neon.libsonnet'; + +local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql'; +local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql'; + +{ + metric_name: 'compute_logical_snapshots_bytes', + type: 'gauge', + help: 'Size of the pg_logical/snapshots directory, not including temporary files', + key_labels: [ + 'timeline_id', + ], + values: [ + 'logical_snapshots_bytes', + ], + query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir, +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql new file mode 100644 index 0000000000..16da899de2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql @@ -0,0 +1,9 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. 
We only WAL-log the completely built + -- snapshot files + (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) + FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name + ) AS logical_snapshots_bytes; From 71f38d135467ef8691f062c62fa5d8f3bf49ea6d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:37:17 -0800 Subject: [PATCH 07/26] feat(pageserver): support schedule gc-compaction (#9809) ## Problem part of https://github.com/neondatabase/neon/issues/9114 gc-compaction can take a long time. This patch adds support for scheduling a gc-compaction job. The compaction loop will first handle L0->L1 compaction, and then gc compaction. The scheduled jobs are stored in a non-persistent queue within the tenant structure. This will be the building block for the partial compaction trigger -- if the system determines that we need to do a gc compaction, it will partition the keyspace and schedule several jobs. Each of these jobs will run for a short amount of time (i.e, 1 min). L0 compaction will be prioritized over gc compaction. ## Summary of changes * Add compaction scheduler in tenant. * Run scheduled compaction in integration tests. * Change the manual compaction API to allow schedule a compaction instead of immediately doing it. * Add LSN upper bound as gc-compaction parameter. If we schedule partial compactions, gc_cutoff might move across different runs. Therefore, we need to pass a pre-determined gc_cutoff beforehand. (TODO: support LSN lower bound so that we can compact arbitrary "rectangle" in the layer map) * Refactor the gc_compaction internal interface. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/http/routes.rs | 66 +++++-- pageserver/src/tenant.rs | 171 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 29 +++- pageserver/src/tenant/timeline/compaction.rs | 58 ++++--- test_runner/regress/test_compaction.py | 41 +++-- 5 files changed, 291 insertions(+), 74 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e04f1460a8..b3981b4a8e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -87,7 +87,7 @@ use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRange; +use crate::tenant::timeline::CompactRequest; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; @@ -1978,6 +1978,26 @@ async fn timeline_gc_handler( json_response(StatusCode::OK, gc_result) } +// Cancel scheduled compaction tasks +async fn timeline_cancel_compact_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.cancel_scheduled_compaction(timeline_id); + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), 
%timeline_id)) + .await +} + // Run compaction immediately on given timeline. async fn timeline_compact_handler( mut request: Request, @@ -1987,7 +2007,7 @@ async fn timeline_compact_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let compact_range = json_request_maybe::>(&mut request).await?; + let compact_request = json_request_maybe::>(&mut request).await?; let state = get_state(&request); @@ -2012,22 +2032,42 @@ async fn timeline_compact_handler( let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + let wait_until_scheduled_compaction_done = + parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")? + .unwrap_or(false); + let options = CompactOptions { - compact_range, + compact_range: compact_request + .as_ref() + .and_then(|r| r.compact_range.clone()), + compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn), flags, }; + let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false); + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; - timeline - .compact_with_options(&cancel, options, &ctx) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - if wait_until_uploaded { - timeline.remote_client.wait_completion().await - // XXX map to correct ApiError for the cases where it's due to shutdown - .context("wait completion").map_err(ApiError::InternalServerError)?; + if scheduled { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let rx = tenant.schedule_compaction(timeline_id, options).await; + if wait_until_scheduled_compaction_done { + // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction. 
+ rx.await.ok(); + } + } else { + timeline + .compact_with_options(&cancel, options, &ctx) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; + } } json_response(StatusCode::OK, ()) } @@ -3301,6 +3341,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| api_handler(r, timeline_cancel_compact_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5a9e398586..306ec9f548 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -37,14 +37,18 @@ use remote_timeline_client::manifest::{ }; use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; +use std::collections::VecDeque; use std::fmt; use std::future::Future; use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::ScheduledCompactionTask; use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::CompactFlags; +use timeline::CompactOptions; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -339,6 +343,11 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, + /// Scheduled compaction tasks. Currently, this can only be populated by triggering + /// a manual gc-compaction from the manual compaction API. + scheduled_compaction_tasks: + std::sync::Mutex>>, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. @@ -2953,27 +2962,68 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { + // pending_task_left == None: cannot compact, maybe still pending tasks + // pending_task_left == Some(true): compaction task left + // pending_task_left == Some(false): no compaction task left let pending_task_left = if *can_compact { - Some( - timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Offload(_) => { - // Failures to offload timelines do not trip the circuit breaker, because - // they do not do lots of writes the way compaction itself does: it is cheap - // to retry, and it would be bad to stop all compaction because of an issue with offloading. 
+ let has_pending_l0_compaction_task = timeline + .compact(cancel, EnumSet::empty(), ctx) + .instrument(info_span!("compact_timeline", %timeline_id)) + .await + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Offload(_) => { + // Failures to offload timelines do not trip the circuit breaker, because + // they do not do lots of writes the way compaction itself does: it is cheap + // to retry, and it would be bad to stop all compaction because of an issue with offloading. + } + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } + })?; + if has_pending_l0_compaction_task { + Some(true) + } else { + let has_pending_scheduled_compaction_task; + let next_scheduled_compaction_task = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { + let next_task = tline_pending_tasks.pop_front(); + has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); + next_task + } else { + has_pending_scheduled_compaction_task = false; + None + } + }; + if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task + { + if !next_scheduled_compaction_task + .options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); + } else { + let _ = timeline + .compact_with_options( + cancel, + next_scheduled_compaction_task.options, + ctx, + ) + .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) + .await?; + if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { + // TODO: we can send compaction statistics in the future + tx.send(()).ok(); } - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })?, - ) + } + } + Some(has_pending_scheduled_compaction_task) + } } else { None }; @@ -2993,6 +3043,36 @@ impl Tenant { Ok(has_pending_task) } + /// Cancel scheduled compaction tasks + pub(crate) fn cancel_scheduled_compaction( + &self, + timeline_id: TimelineId, + ) -> Vec { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { + let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); + current_tline_pending_tasks.into_iter().collect() + } else { + Vec::new() + } + } + + /// Schedule a compaction task for a timeline. + pub(crate) async fn schedule_compaction( + &self, + timeline_id: TimelineId, + options: CompactOptions, + ) -> tokio::sync::oneshot::Receiver<()> { + let (tx, rx) = tokio::sync::oneshot::channel(); + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + let tline_pending_tasks = guard.entry(timeline_id).or_default(); + tline_pending_tasks.push_back(ScheduledCompactionTask { + options, + result_tx: Some(tx), + }); + rx + } + // Call through to all timelines to freeze ephemeral layers if needed. Usually // this happens during ingest: this background housekeeping is for freezing layers // that are open but haven't been written to for some time. @@ -4005,6 +4085,7 @@ impl Tenant { // use an extremely long backoff. 
Some(Duration::from_secs(3600 * 24)), )), + scheduled_compaction_tasks: Mutex::new(Default::default()), activate_now_sem: tokio::sync::Semaphore::new(0), attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), @@ -9163,6 +9244,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, + compact_below_lsn: None, }, &ctx, ) @@ -9399,6 +9481,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, + compact_below_lsn: None, }, &ctx, ) @@ -9885,7 +9968,15 @@ mod tests { // Do a partial compaction on key range 0..2 tline - .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(0)..get_key(2)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -9924,7 +10015,15 @@ mod tests { // Do a partial compaction on key range 2..4 tline - .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(2)..get_key(4)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -9968,7 +10067,15 @@ mod tests { // Do a partial compaction on key range 4..9 tline - .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(4)..get_key(9)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -10011,7 +10118,15 @@ mod tests { // Do a partial compaction on key range 9..10 tline - .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(9)..get_key(10)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -10059,7 +10174,15 @@ mod tests { // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. tline - .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(0)..get_key(10)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fc741826ab..fc69525bf4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority { Background, } -#[derive(enumset::EnumSetType)] +#[derive(Debug, enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, @@ -777,6 +777,16 @@ pub(crate) enum CompactFlags { DryRun, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize)] +pub(crate) struct CompactRequest { + pub compact_range: Option, + pub compact_below_lsn: Option, + /// Whether the compaction job should be scheduled. 
+ #[serde(default)] + pub scheduled: bool, +} + #[serde_with::serde_as] #[derive(Debug, Clone, serde::Deserialize)] pub(crate) struct CompactRange { @@ -786,10 +796,24 @@ pub(crate) struct CompactRange { pub end: Key, } -#[derive(Clone, Default)] +impl From> for CompactRange { + fn from(range: Range) -> Self { + CompactRange { + start: range.start, + end: range.end, + } + } +} + +#[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, + /// If set, the compaction will only compact the key range specified by this option. + /// This option is only used by GC compaction. pub compact_range: Option, + /// If set, the compaction will only compact the LSN below this value. + /// This option is only used by GC compaction. + pub compact_below_lsn: Option, } impl std::fmt::Debug for Timeline { @@ -1604,6 +1628,7 @@ impl Timeline { CompactOptions { flags, compact_range: None, + compact_below_lsn: None, }, ctx, ) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ecd68ba55e..8ececa2bfb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -16,7 +16,6 @@ use super::{ use anyhow::{anyhow, bail, Context}; use bytes::Bytes; -use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::key::KEY_SIZE; @@ -64,6 +63,12 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; +/// A scheduled compaction task. +pub struct ScheduledCompactionTask { + pub options: CompactOptions, + pub result_tx: Option>, +} + pub struct GcCompactionJobDescription { /// All layers to read in the compaction job selected_layers: Vec, @@ -1746,24 +1751,6 @@ impl Timeline { Ok(()) } - pub(crate) async fn compact_with_gc( - self: &Arc, - cancel: &CancellationToken, - options: CompactOptions, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.partial_compact_with_gc( - options - .compact_range - .map(|range| range.start..range.end) - .unwrap_or_else(|| Key::MIN..Key::MAX), - cancel, - options.flags, - ctx, - ) - .await - } - /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1771,17 +1758,19 @@ impl Timeline { /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. /// - /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction. + /// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction. /// Partial compaction will read and process all layers overlapping with the key range, even if it might /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not /// part of the range. - pub(crate) async fn partial_compact_with_gc( + /// + /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with + /// the LSN. Otherwise, it will use the gc cutoff by default. 
+ pub(crate) async fn compact_with_gc( self: &Arc, - compaction_key_range: Range, cancel: &CancellationToken, - flags: EnumSet, + options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { // Block other compaction/GC tasks from running for now. GC-compaction could run along @@ -1803,6 +1792,12 @@ impl Timeline { ) .await?; + let flags = options.flags; + let compaction_key_range = options + .compact_range + .map(|range| range.start..range.end) + .unwrap_or_else(|| Key::MIN..Key::MAX); + let dry_run = flags.contains(CompactFlags::DryRun); if compaction_key_range == (Key::MIN..Key::MAX) { @@ -1826,7 +1821,18 @@ impl Timeline { let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); - let gc_cutoff = gc_info.cutoffs.select_min(); + let gc_cutoff = { + let real_gc_cutoff = gc_info.cutoffs.select_min(); + // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for + // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use + // the real cutoff. + let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff); + if gc_cutoff > real_gc_cutoff { + warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + gc_cutoff = real_gc_cutoff; + } + gc_cutoff + }; for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); @@ -1846,7 +1852,7 @@ impl Timeline { .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc"); + info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); return Ok(()); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key @@ -1869,7 +1875,7 @@ impl Timeline { } } if selected_layers.is_empty() { - info!("no layers to compact with gc"); + info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end); return Ok(()); } retain_lsns_below_horizon.sort(); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index b6741aed68..de6653eb3f 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -15,7 +15,7 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload -AGGRESIVE_COMPACTION_TENANT_CONF = { +AGGRESSIVE_COMPACTION_TENANT_CONF = { # Disable gc and compaction. The test runs compaction manually. 
"gc_period": "0s", "compaction_period": "0s", @@ -24,6 +24,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { # Compact small layers "compaction_target_size": 1024**2, "image_creation_threshold": 2, + # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } @@ -51,7 +52,7 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -120,14 +121,25 @@ page_cache_size=10 assert vectored_average < 8 +@skip_in_debug_build("only run with release build") def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 ** 2}", + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline - row_count = 1000 - churn_rounds = 10 + row_count = 10000 + churn_rounds = 50 ps_http = env.pageserver.http_client() @@ -141,20 +153,27 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): if i % 10 == 0: log.info(f"Running churn round {i}/{churn_rounds} ...") - workload.churn_rows(row_count, env.pageserver.id) - # Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run. - ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) - assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1 ps_http.timeline_compact( tenant_id, timeline_id, enhanced_gc_bottom_most_compaction=True, body={ - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", + "scheduled": True, + "compact_range": { + "start": "000000000000000000000000000000000000", + # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this + "end": "010000000000000000000000000000000000", + }, }, ) + workload.churn_rows(row_count, env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains( + "scheduled_compact_timeline.*picked .* layers for compaction" + ) + log.info("Validating at workload end ...") workload.validate(env.pageserver.id) From 6331cb216195658b7926cadb8045759aa71c4575 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 5 Dec 2024 13:42:52 -0600 Subject: [PATCH 08/26] Bump anyhow to 1.0.94 (#10028) We were over a year out of date. 
Signed-off-by: Tristan Partin --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 62f06d45bd..f6e0024d87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,9 +133,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" dependencies = [ "backtrace", ] From 6ff4175fd7e62577ad0a7d1bba4fc3b6237ac764 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 5 Dec 2024 14:30:35 -0600 Subject: [PATCH 09/26] Send Content-Type header on reconfigure request from neon_local (#10029) Signed-off-by: Tristan Partin --- control_plane/src/endpoint.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 360857f365..35067c95b6 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -53,6 +53,7 @@ use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; +use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -818,6 +819,7 @@ impl Endpoint { self.http_address.ip(), self.http_address.port() )) + .header(CONTENT_TYPE.as_str(), "application/json") .body(format!( "{{\"spec\":{}}}", serde_json::to_string_pretty(&spec)? From d1ab7471e2d6603a5680ba33f749adb743c2154b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 5 Dec 2024 21:51:57 +0100 Subject: [PATCH 10/26] Fix desc_str for Azure container (#10021) Small logs fix I've noticed while working on https://github.com/neondatabase/cloud/issues/19963 . --- storage_scrubber/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 1fe4fc58cd..be526daaf0 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -268,7 +268,7 @@ impl BucketConfig { config.bucket_name, config.bucket_region ), RemoteStorageKind::AzureContainer(config) => format!( - "bucket {}, storage account {:?}, region {}", + "container {}, storage account {:?}, region {}", config.container_name, config.storage_account, config.container_region ), } From 56f867bde5324b0d3333faaf7360aa07245f68c0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 08:22:22 +0100 Subject: [PATCH 11/26] pageserver: only zero truncated FSM page on owning shard (#10032) ## Problem FSM pages are managed like regular relation pages, and owned by a single shard. However, when truncating the FSM relation the last FSM page was zeroed out on all shards. This is unnecessary and potentially confusing. The superfluous keys will be removed during compactions, as they do not belong on these shards. Resolves #10027. ## Summary of changes Only zero out the truncated FSM page on the owning shard. --- pageserver/src/walingest.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 93ae88936f..30c8965d51 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -582,18 +582,21 @@ impl WalIngest { forknum: FSM_FORKNUM, }; + // Zero out the last remaining FSM page, if this shard owns it. 
We are not precise here, + // and instead of digging in the FSM bitmap format we just clear the whole page. let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE; let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no); - if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { - // Tail of last remaining FSM page has to be zeroed. - // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. + if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 + && self + .shard + .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no)) + { modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } - // TODO: re-examine the None case here wrt. sharding; should we error? + // Truncate this shard's view of the FSM relation size, if it even has one. let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > fsm_physical_page_no { - // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } @@ -617,7 +620,7 @@ impl WalIngest { // tail bits in the last remaining map page, representing truncated heap // blocks, need to be cleared. This is not only tidy, but also necessary // because we don't get a chance to clear the bits if the heap is extended - // again. + // again. Only do this on the shard that owns the page. if (trunc_byte != 0 || trunc_offs != 0) && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) { @@ -631,10 +634,9 @@ impl WalIngest { )?; vm_page_no += 1; } - // TODO: re-examine the None case here wrt. sharding; should we error? + // Truncate this shard's view of the VM relation size, if it even has one. let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > vm_page_no { - // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } From ec4072f84577eeb2a92d97fa77281efe50325730 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 11:12:39 +0100 Subject: [PATCH 12/26] pageserver: add `wait_until_flushed` parameter for timeline checkpoint (#10013) ## Problem I'm writing an ingest benchmark in #9812. To time S3 uploads, I need to schedule a flush of the Pageserver's in-memory layer, but don't actually want to wait around for it to complete (which will take a minute). ## Summary of changes Add a parameter `wait_until_flush` (default `true`) for `timeline/checkpoint` to control whether to wait for the flush to complete. --- pageserver/src/http/routes.rs | 12 ++++++++---- pageserver/src/tenant/timeline.rs | 26 ++++++++++++++++--------- test_runner/fixtures/pageserver/http.py | 5 ++++- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b3981b4a8e..b7fddb065c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2148,16 +2148,20 @@ async fn timeline_checkpoint_handler( // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. 
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); + let wait_until_flushed: bool = + parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true); + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; - timeline - .freeze_and_flush() - .await - .map_err(|e| { + if wait_until_flushed { + timeline.freeze_and_flush().await + } else { + timeline.freeze().await.and(Ok(())) + }.map_err(|e| { match e { tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, other => ApiError::InternalServerError(other.into()), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fc69525bf4..aab6703a3c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1457,23 +1457,31 @@ impl Timeline { Ok(lease) } - /// Flush to disk all data that was written with the put_* functions + /// Freeze the current open in-memory layer. It will be written to disk on next iteration. + /// Returns the flush request ID which can be awaited with wait_flush_completion(). + #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] + pub(crate) async fn freeze(&self) -> Result { + self.freeze0().await + } + + /// Freeze and flush the open in-memory layer, waiting for it to be written to disk. #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { self.freeze_and_flush0().await } + /// Freeze the current open in-memory layer. It will be written to disk on next iteration. + /// Returns the flush request ID which can be awaited with wait_flush_completion(). + pub(crate) async fn freeze0(&self) -> Result { + let mut g = self.write_lock.lock().await; + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn, &mut g).await + } + // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let token = { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - let mut g = self.write_lock.lock().await; - - let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await? 
- }; + let token = self.freeze0().await?; self.wait_flush_completion(token).await } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 4cf3ece396..0832eac22f 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -850,6 +850,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, + wait_until_flushed=True, wait_until_uploaded=False, compact: bool | None = None, **kwargs, @@ -862,6 +863,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["force_image_layer_creation"] = "true" if force_l0_compaction: query["force_l0_compaction"] = "true" + if not wait_until_flushed: + query["wait_until_flushed"] = "false" if wait_until_uploaded: query["wait_until_uploaded"] = "true" @@ -869,7 +872,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): query["compact"] = "true" if compact else "false" log.info( - f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}" + f"Requesting checkpoint: tenant={tenant_id} timeline={timeline_id} wait_until_flushed={wait_until_flushed} wait_until_uploaded={wait_until_uploaded} compact={compact}" ) res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", From 3f1c5429577ca1dee8c5e0955e4072cee2a13eca Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 6 Dec 2024 10:21:52 +0000 Subject: [PATCH 13/26] pageserver: add disk consistent and remote lsn metrics (#10005) ## Problem There's no metrics for disk consistent LSN and remote LSN. This stuff is useful when looking at ingest performance. ## Summary of changes Two per timeline metrics are added: `pageserver_disk_consistent_lsn` and `pageserver_projected_remote_consistent_lsn`. I went for the projected remote lsn instead of the visible one because that more closely matches remote storage write tput. Ideally we would have both, but these metrics are expensive. 
--- pageserver/src/metrics.rs | 46 +++++++++++++++++-- .../src/tenant/remote_timeline_client.rs | 3 ++ pageserver/src/tenant/timeline.rs | 8 +++- test_runner/fixtures/metrics.py | 2 + 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 998c15ccaf..e3b6f43bc4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static DISK_CONSISTENT_LSN: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_disk_consistent_lsn", + "Disk consistent LSN grouped by timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_projected_remote_consistent_lsn", + "Projected remote consistent LSN grouped by timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_pitr_history_size", @@ -2394,7 +2412,8 @@ pub(crate) struct TimelineMetrics { pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, - pub last_record_gauge: IntGauge, + pub last_record_lsn_gauge: IntGauge, + pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, pub(crate) layer_size_image: UIntGauge, @@ -2475,7 +2494,11 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let last_record_gauge = LAST_RECORD_LSN + let last_record_lsn_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2578,7 +2601,8 @@ impl TimelineMetrics { garbage_collect_histo, find_gc_cutoffs_histo, load_layer_map_histo, - last_record_gauge, + last_record_lsn_gauge, + disk_consistent_lsn_gauge, pitr_history_size, archival_size, layer_size_image, @@ -2642,6 +2666,7 @@ impl TimelineMetrics { let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { @@ -2805,6 +2830,7 @@ pub(crate) struct RemoteTimelineClientMetrics { calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, + pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge, } impl RemoteTimelineClientMetrics { @@ -2819,6 +2845,10 @@ impl RemoteTimelineClientMetrics { .unwrap(), ); + let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(); + RemoteTimelineClientMetrics { tenant_id: tenant_id_str, shard_id: shard_id_str, @@ -2827,6 +2857,7 @@ impl RemoteTimelineClientMetrics { bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge, + projected_remote_consistent_lsn_gauge, } } @@ -3040,6 +3071,7 @@ impl Drop for RemoteTimelineClientMetrics { 
calls, bytes_started_counter, bytes_finished_counter, + projected_remote_consistent_lsn_gauge, } = self; for ((a, b), _) in calls.get_mut().unwrap().drain() { let mut res = [Ok(()), Ok(())]; @@ -3069,6 +3101,14 @@ impl Drop for RemoteTimelineClientMetrics { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + { + let _ = projected_remote_consistent_lsn_gauge; + let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); + } } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 89b935947d..20e0536a00 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2192,6 +2192,9 @@ impl RemoteTimelineClient { upload_queue.clean.1 = Some(task.task_id); let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn(); + self.metrics + .projected_remote_consistent_lsn_gauge + .set(lsn.0); if self.generation.is_none() { // Legacy mode: skip validating generation diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aab6703a3c..bf3d7a74a3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2392,7 +2392,7 @@ impl Timeline { result .metrics - .last_record_gauge + .last_record_lsn_gauge .set(disk_consistent_lsn.0 as i64); result }) @@ -3514,7 +3514,7 @@ impl Timeline { pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); - self.metrics.last_record_gauge.set(new_lsn.0 as i64); + self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64); self.last_record_lsn.advance(new_lsn); } @@ -3882,6 +3882,10 @@ impl Timeline { fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + + self.metrics + .disk_consistent_lsn_gauge + .set(new_value.0 as i64); new_value != old_value } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index ffdbd988a5..1278ed1aef 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -152,6 +152,8 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", "pageserver_last_record_lsn", + "pageserver_disk_consistent_lsn", + "pageserver_projected_remote_consistent_lsn", "pageserver_standby_horizon", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", From 7838659197e40ecdb0735c01cb21dd2298492d24 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 11:24:13 +0100 Subject: [PATCH 14/26] pageserver: assert that keys belong to shard (#9943) We've seen cases where stray keys end up on the wrong shard. This shouldn't happen. Add debug assertions to prevent this. In release builds, we should be lenient in order to handle changing key ownership policies. Touches #9914. 
--- libs/pageserver_api/src/shard.rs | 5 +++-- libs/utils/src/shard.rs | 6 ++++++ pageserver/src/tenant/timeline.rs | 19 ++++++++++++++++++- pageserver/src/tenant/timeline/compaction.rs | 16 +++++++++++----- 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a5c94a82c1..cf0cd3a46b 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -158,7 +158,8 @@ impl ShardIdentity { key_to_shard_number(self.count, self.stripe_size, key) } - /// Return true if the key should be ingested by this shard + /// Return true if the key is stored only on this shard. This does not include + /// global keys, see is_key_global(). /// /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { @@ -171,7 +172,7 @@ impl ShardIdentity { } /// Return true if the key should be stored on all shards, not just one. - fn is_key_global(&self, key: &Key) -> bool { + pub fn is_key_global(&self, key: &Key) -> bool { if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { // Special keys that are only stored on shard 0 false diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 782cddc599..6352ea9f92 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -164,6 +164,12 @@ impl TenantShardId { } } +impl std::fmt::Display for ShardNumber { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bf3d7a74a3..0657d1af3a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -53,7 +53,7 @@ use utils::{ postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; -use wal_decoder::serialized_batch::SerializedValueBatch; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; @@ -5924,6 +5924,23 @@ impl<'a> TimelineWriter<'a> { return Ok(()); } + // In debug builds, assert that we don't write any keys that don't belong to this shard. + // We don't assert this in release builds, since key ownership policies may change over + // time. Stray keys will be removed during compaction. 
+ if cfg!(debug_assertions) { + for metadata in &batch.metadata { + if let ValueMeta::Serialized(metadata) = metadata { + let key = Key::from_compact(metadata.key); + assert!( + self.shard_identity.is_key_local(&key) + || self.shard_identity.is_key_global(&key), + "key {key} does not belong on shard {}", + self.shard_identity.shard_index() + ); + } + } + } + let batch_max_lsn = batch.max_lsn; let buf_size: u64 = batch.buffer_size() as u64; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8ececa2bfb..7f86ede043 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1179,11 +1179,12 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); + let shard = self.shard_identity.shard_index(); + let owner = self.shard_identity.get_shard_number(&key); + if cfg!(debug_assertions) { + panic!("key {key} does not belong on shard {shard}, owned by {owner}"); + } + debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } if !new_layers.is_empty() { @@ -2054,6 +2055,11 @@ impl Timeline { // This is not handled in the filter iterator because shard is determined by hash. // Therefore, it does not give us any performance benefit to do things like skip // a whole layer file as handling key spaces (ranges). + if cfg!(debug_assertions) { + let shard = self.shard_identity.shard_index(); + let owner = self.shard_identity.get_shard_number(&key); + panic!("key {key} does not belong on shard {shard}, owned by {owner}"); + } continue; } if !job_desc.compaction_key_range.contains(&key) { From fa07097f2ff12b6560f4122e0654b24e5f9561e2 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 6 Dec 2024 12:44:50 +0100 Subject: [PATCH 15/26] chore: Reorganize and refresh CODEOWNERS (#10008) ## Problem We didn't have a codeowner for `/compute`, so nobody was auto-assigned for PRs like #9973 ## Summary of changes While on it: 1. Group codeowners into sections. 2. Remove control plane from the `/compute_tools` because it's primarily the internal `compute_ctl` code. 3. Add control plane (and compute) to `/libs/compute_api` because that's the shared public interface of the compute. 
--- CODEOWNERS | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index f41462c98b..71b5e65f94 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,16 +1,29 @@ -/.github/ @neondatabase/developer-productivity -/compute_tools/ @neondatabase/control-plane @neondatabase/compute -/libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage -/libs/proxy/ @neondatabase/proxy -/libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/storage +# Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling -/pageserver/ @neondatabase/storage + +# DevProd +/.github/ @neondatabase/developer-productivity + +# Compute /pgxn/ @neondatabase/compute -/pgxn/neon/ @neondatabase/compute @neondatabase/storage +/vendor/ @neondatabase/compute +/compute/ @neondatabase/compute +/compute_tools/ @neondatabase/compute + +# Proxy +/libs/proxy/ @neondatabase/proxy /proxy/ @neondatabase/proxy + +# Storage +/pageserver/ @neondatabase/storage /safekeeper/ @neondatabase/storage /storage_controller @neondatabase/storage /storage_scrubber @neondatabase/storage -/vendor/ @neondatabase/compute +/libs/pageserver_api/ @neondatabase/storage +/libs/remote_storage/ @neondatabase/storage +/libs/safekeeper_api/ @neondatabase/storage + +# Shared +/pgxn/neon/ @neondatabase/compute @neondatabase/storage +/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage From cc70fc802d2107122b330dba6ce8e2d8f8799189 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 6 Dec 2024 12:51:41 +0000 Subject: [PATCH 16/26] pageserver: add metric for number of wal records received by each shard (#10035) ## Problem With the current metrics we can't identify which shards are ingesting data at any given time. ## Summary of changes Add a metric for the number of wal records received for processing by each shard. This is per (tenant, timeline, shard). --- pageserver/src/metrics.rs | 20 +++++++++++++++++++ .../walreceiver/walreceiver_connection.rs | 8 ++++++++ test_runner/fixtures/metrics.py | 1 + 3 files changed, 29 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e3b6f43bc4..62bf9acf01 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2204,6 +2204,15 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet .expect("failed to define a metric"), }); +pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_timeline_wal_records_received", + "Number of WAL records received per shard", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -2431,6 +2440,7 @@ pub(crate) struct TimelineMetrics { pub evictions_with_low_residence_duration: std::sync::RwLock, /// Number of valid LSN leases. 
pub valid_lsn_lease_count_gauge: UIntGauge, + pub wal_records_received: IntCounter, shutdown: std::sync::atomic::AtomicBool, } @@ -2588,6 +2598,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -2620,6 +2634,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration, ), valid_lsn_lease_count_gauge, + wal_records_received, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -2757,6 +2772,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d90ffbfa2c..3f10eeda60 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. let mut modification = timeline.begin_modification(Lsn(0)); + if !records.is_empty() { + timeline + .metrics + .wal_records_received + .inc_by(records.len() as u64); + } + for interpreted in records { if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 @@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection( } // Ingest the records without immediately committing them. + timeline.metrics.wal_records_received.inc(); let ingested = walingest .ingest_record(interpreted, &mut modification, &ctx) .await diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 1278ed1aef..52ed7da36b 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -175,6 +175,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), + counter("pageserver_timeline_wal_records_received"), *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold From 14c4fae64af5613c682ec7dd7d30e484c476e5af Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 16:17:15 +0100 Subject: [PATCH 17/26] test_runner/performance: add improved bulk insert benchmark (#9812) Adds an improved bulk insert benchmark, including S3 uploads. Touches #9789. 
--- test_runner/fixtures/pageserver/utils.py | 22 +-- .../performance/test_ingest_insert_bulk.py | 142 ++++++++++++++++++ 2 files changed, 149 insertions(+), 15 deletions(-) create mode 100644 test_runner/performance/test_ingest_insert_bulk.py diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 7c10edc5fc..66f61f9b4c 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -54,23 +54,15 @@ def wait_for_upload( tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, + timeout=20, ): - """waits for local timeline upload up to specified lsn""" + """Waits for local timeline upload up to specified LSN""" - current_lsn = Lsn(0) - for i in range(20): - current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) - if current_lsn >= lsn: - log.info("wait finished") - return - lr_lsn = last_record_lsn(pageserver_http, tenant, timeline) - log.info( - f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}" - ) - time.sleep(1) - raise Exception( - f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}" - ) + def is_uploaded(): + remote_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) + assert remote_lsn >= lsn, f"remote_consistent_lsn at {remote_lsn}" + + wait_until(is_uploaded, name=f"upload to {lsn}", timeout=timeout) def _tenant_in_expected_state(tenant_info: dict[str, Any], expected_state: str): diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py new file mode 100644 index 0000000000..283bcada31 --- /dev/null +++ b/test_runner/performance/test_ingest_insert_bulk.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import random +from concurrent.futures import ThreadPoolExecutor + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_for_upload, + wait_for_upload_queue_empty, +) +from fixtures.remote_storage import s3_storage + + +@pytest.mark.timeout(900) +@pytest.mark.parametrize("size", [8, 1024, 8192]) +@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"]) +@pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"]) +@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) +def test_ingest_insert_bulk( + request: pytest.FixtureRequest, + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + fsync: bool, + backpressure: bool, + s3: bool, + size: int, +): + """ + Benchmarks ingestion of 5 GB of sequential insert WAL. Measures ingestion and S3 upload + separately. Also does a Safekeeper→Pageserver re-ingestion to measure Pageserver ingestion in + isolation. + """ + + CONCURRENCY = 1 # 1 is optimal without fsync or backpressure + VOLUME = 5 * 1024**3 + rows = VOLUME // (size + 64) # +64 roughly accounts for per-row WAL overhead + + neon_env_builder.safekeepers_enable_fsync = fsync + + if s3: + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # NB: don't use S3 for Safekeeper. It doesn't affect throughput (no backpressure), but it + # would compete with Pageserver for bandwidth. 
+ # neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + + neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + f"fsync = {fsync}", + "max_replication_apply_lag = 0", + f"max_replication_flush_lag = {'10GB' if backpressure else '0'}", + # NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB. + f"max_replication_write_lag = {'500MB' if backpressure else '0'}", + ], + ) + endpoint.safe_psql("create extension neon") + + # Wait for the timeline to be propagated to the pageserver. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Ingest rows. + log.info("Ingesting data") + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + def insert_rows(endpoint, table, count, value): + with endpoint.connect().cursor() as cur: + cur.execute("set statement_timeout = 0") + cur.execute(f"create table {table} (id int, data bytea)") + cur.execute(f"insert into {table} values (generate_series(1, {count}), %s)", (value,)) + + with zenbenchmark.record_duration("upload"): + with zenbenchmark.record_duration("ingest"): + with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool: + for i in range(CONCURRENCY): + # Write a random value for all rows. This is sufficient to prevent compression, + # e.g. in TOAST. Randomly generating every row is too slow. + value = random.randbytes(size) + worker_rows = rows / CONCURRENCY + pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Wait for pageserver to ingest the WAL. + client = env.pageserver.http_client() + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Wait for pageserver S3 upload. Checkpoint to flush the last in-memory layer. + client.timeline_checkpoint( + env.initial_tenant, + env.initial_timeline, + compact=False, + wait_until_flushed=False, + ) + wait_for_upload(client, env.initial_tenant, env.initial_timeline, end_lsn, timeout=600) + + # Empty out upload queue for next benchmark. + wait_for_upload_queue_empty(client, env.initial_tenant, env.initial_timeline) + + backpressure_time = endpoint.safe_psql("select backpressure_throttling_time()")[0][0] + + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will + # reingest all the WAL directly from the safekeeper. This gives us a baseline of how fast the + # pageserver can ingest this WAL in isolation. + status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant) + assert status is not None + + endpoint.stop() # avoid spurious getpage errors + client.tenant_delete(env.initial_tenant) + env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0]) + + with zenbenchmark.record_duration("recover"): + log.info("Recovering WAL into pageserver") + client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Emit metrics. 
+ wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + zenbenchmark.record("row_count", rows, "rows", MetricReport.TEST_PARAM) + zenbenchmark.record("concurrency", CONCURRENCY, "clients", MetricReport.TEST_PARAM) + zenbenchmark.record( + "backpressure_time", backpressure_time // 1000, "ms", MetricReport.LOWER_IS_BETTER + ) + + props = {p["name"]: p["value"] for _, p in request.node.user_properties} + for name in ("ingest", "upload", "recover"): + throughput = int(wal_written_mb / props[name]) + zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER) + + # Pageserver shutdown will likely get stuck on the upload queue, just shut it down immediately. + env.stop(immediate=True) From e4837b0a5a65e8515949fad634d147cb2c2a8caf Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 6 Dec 2024 11:43:55 -0600 Subject: [PATCH 18/26] Bump sql_exporter to 0.16.0 (#10041) Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 2 +- compute/compute-node.Dockerfile | 2 +- test_runner/regress/test_compute_metrics.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 2671702697..fa84e467ad 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -115,7 +115,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. -ENV SQL_EXPORTER_VERSION=0.13.1 +ENV SQL_EXPORTER_VERSION=0.16.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index bf6311bf2b..33d2a10285 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. -FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter +FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter ######################################################################################### # diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 1b15c5f15e..787790103f 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -215,7 +215,7 @@ if SQL_EXPORTER is None: # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. - super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port From c42c28b339289a872400a4e9f0d1b4cc02048354 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:44:26 -0500 Subject: [PATCH 19/26] feat(pageserver): gc-compaction split job and partial scheduler (#9897) ## Problem part of https://github.com/neondatabase/neon/issues/9114, stacked PR over #9809 The compaction scheduler now schedules partial compaction jobs. 
## Summary of changes * Add the compaction job splitter based on size. * Schedule subcompactions using the compaction scheduler. * Test subcompaction scheduler in the smoke regress test. * Temporarily disable layer map checks --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 10 +- pageserver/src/tenant.rs | 49 +++++- pageserver/src/tenant/timeline.rs | 7 + pageserver/src/tenant/timeline/compaction.rs | 162 +++++++++++++++++-- test_runner/regress/test_compaction.py | 1 + 5 files changed, 209 insertions(+), 20 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b7fddb065c..0f11bbc507 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2036,15 +2036,23 @@ async fn timeline_compact_handler( parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")? .unwrap_or(false); + let sub_compaction = compact_request + .as_ref() + .map(|r| r.sub_compaction) + .unwrap_or(false); let options = CompactOptions { compact_range: compact_request .as_ref() .and_then(|r| r.compact_range.clone()), compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn), flags, + sub_compaction, }; - let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false); + let scheduled = compact_request + .as_ref() + .map(|r| r.scheduled) + .unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 306ec9f548..4a9c44aefd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -49,6 +49,7 @@ use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::CompactFlags; use timeline::CompactOptions; +use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -2987,10 +2988,16 @@ impl Tenant { if has_pending_l0_compaction_task { Some(true) } else { - let has_pending_scheduled_compaction_task; + let mut has_pending_scheduled_compaction_task; let next_scheduled_compaction_task = { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { + if !tline_pending_tasks.is_empty() { + info!( + "{} tasks left in the compaction schedule queue", + tline_pending_tasks.len() + ); + } let next_task = tline_pending_tasks.pop_front(); has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); next_task @@ -3007,6 +3014,32 @@ impl Tenant { .contains(CompactFlags::EnhancedGcBottomMostCompaction) { warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); + } else if next_scheduled_compaction_task.options.sub_compaction { + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs = timeline + .gc_compaction_split_jobs(next_scheduled_compaction_task.options) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + } else { + has_pending_scheduled_compaction_task = true; + let jobs_len = jobs.len(); + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + let tline_pending_tasks = guard.entry(*timeline_id).or_default(); + for (idx, job) in jobs.into_iter().enumerate() { + tline_pending_tasks.push_back(ScheduledCompactionTask { + options: job, + result_tx: if idx == jobs_len - 1 { + // The last compaction job sends 
the completion signal + next_scheduled_compaction_task.result_tx.take() + } else { + None + }, + }); + } + info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + } } else { let _ = timeline .compact_with_options( @@ -9244,7 +9277,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -9481,7 +9514,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -9973,7 +10006,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(0)..get_key(2)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10020,7 +10053,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(2)..get_key(4)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10072,7 +10105,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(4)..get_key(9)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10123,7 +10156,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(9)..get_key(10)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10179,7 +10212,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(0)..get_key(10)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0657d1af3a..8f1d5f6577 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -785,6 +785,9 @@ pub(crate) struct CompactRequest { /// Whether the compaction job should be scheduled. #[serde(default)] pub scheduled: bool, + /// Whether the compaction job should be split across key ranges. + #[serde(default)] + pub sub_compaction: bool, } #[serde_with::serde_as] @@ -814,6 +817,9 @@ pub(crate) struct CompactOptions { /// If set, the compaction will only compact the LSN below this value. /// This option is only used by GC compaction. pub compact_below_lsn: Option, + /// Enable sub-compaction (split compaction job across key ranges). + /// This option is only used by GC compaction. 
+ pub sub_compaction: bool, } impl std::fmt::Debug for Timeline { @@ -1637,6 +1643,7 @@ impl Timeline { flags, compact_range: None, compact_below_lsn: None, + sub_compaction: false, }, ctx, ) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7f86ede043..a18e157d37 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder, + ImageLayerCreationMode, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -29,7 +29,6 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; -use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -1752,6 +1751,116 @@ impl Timeline { Ok(()) } + /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of + /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much + /// ad-hoc information about gc compaction itself. + pub(crate) async fn gc_compaction_split_jobs( + self: &Arc, + options: CompactOptions, + ) -> anyhow::Result> { + if !options.sub_compaction { + return Ok(vec![options]); + } + let compact_range = options.compact_range.clone().unwrap_or(CompactRange { + start: Key::MIN, + end: Key::MAX, + }); + let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn { + compact_below_lsn + } else { + let gc_info = self.gc_info.read().unwrap(); + gc_info.cutoffs.select_min() // use the real gc cutoff + }; + let mut compact_jobs = Vec::new(); + // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning + // by estimating the amount of files read for a compaction job. We should also partition on LSN. + let Ok(partition) = self.partitioning.try_lock() else { + bail!("failed to acquire partition lock"); + }; + let ((dense_ks, sparse_ks), _) = &*partition; + // Truncate the key range to be within user specified compaction range. 
+ fn truncate_to( + source_start: &Key, + source_end: &Key, + target_start: &Key, + target_end: &Key, + ) -> Option<(Key, Key)> { + let start = source_start.max(target_start); + let end = source_end.min(target_end); + if start < end { + Some((*start, *end)) + } else { + None + } + } + let mut split_key_ranges = Vec::new(); + let ranges = dense_ks + .parts + .iter() + .map(|partition| partition.ranges.iter()) + .chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter())) + .flatten() + .cloned() + .collect_vec(); + for range in ranges.iter() { + let Some((start, end)) = truncate_to( + &range.start, + &range.end, + &compact_range.start, + &compact_range.end, + ) else { + continue; + }; + split_key_ranges.push((start, end)); + } + split_key_ranges.sort(); + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + let mut current_start = None; + // Split compaction job to about 2GB each + const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future + let ranges_num = split_key_ranges.len(); + for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { + if current_start.is_none() { + current_start = Some(start); + } + let start = current_start.unwrap(); + if start >= end { + // We have already processed this partition. + continue; + } + let res = layer_map.range_search(start..end, compact_below_lsn); + let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); + if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 { + let mut compact_options = options.clone(); + // Try to extend the compaction range so that we include at least one full layer file. + let extended_end = res + .found + .keys() + .map(|layer| layer.layer.key_range.end) + .min(); + // It is possible that the search range does not contain any layer files when we reach the end of the loop. + // In this case, we simply use the specified key range end. + let end = if let Some(extended_end) = extended_end { + extended_end.max(end) + } else { + end + }; + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_options.compact_range = Some(CompactRange { start, end }); + compact_options.compact_below_lsn = Some(compact_below_lsn); + compact_options.sub_compaction = false; + compact_jobs.push(compact_options); + current_start = Some(end); + } + } + drop(guard); + Ok(compact_jobs) + } + /// An experimental compaction building block that combines compaction with garbage collection. 
/// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1774,6 +1883,36 @@ impl Timeline { options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { + if options.sub_compaction { + info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs = self.gc_compaction_split_jobs(options).await?; + let jobs_len = jobs.len(); + for (idx, job) in jobs.into_iter().enumerate() { + info!( + "running enhanced gc bottom-most compaction, sub-compaction {}/{}", + idx + 1, + jobs_len + ); + self.compact_with_gc_inner(cancel, job, ctx).await?; + } + if jobs_len == 0 { + info!("no jobs to run, skipping gc bottom-most compaction"); + } + return Ok(()); + } + self.compact_with_gc_inner(cancel, options, ctx).await + } + + async fn compact_with_gc_inner( + self: &Arc, + cancel: &CancellationToken, + options: CompactOptions, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + assert!( + !options.sub_compaction, + "sub-compaction should be handled by the outer function" + ); // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. @@ -1943,14 +2082,15 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let layer_names = job_desc - .selected_layers - .iter() - .map(|layer| layer.layer_desc().layer_name()) - .collect_vec(); - if let Some(err) = check_valid_layermap(&layer_names) { - warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - } + // disable the check for now because we need to adjust the check for partial compactions, will enable later. + // let layer_names = job_desc + // .selected_layers + // .iter() + // .map(|layer| layer.layer_desc().layer_name()) + // .collect_vec(); + // if let Some(err) = check_valid_layermap(&layer_names) { + // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); + // } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index de6653eb3f..e92dc47f39 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -159,6 +159,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): enhanced_gc_bottom_most_compaction=True, body={ "scheduled": True, + "sub_compaction": True, "compact_range": { "start": "000000000000000000000000000000000000", # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this From b6eea655976ad7ebffd9b7edbf193850d2b2b05b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 6 Dec 2024 22:56:57 +0200 Subject: [PATCH 20/26] Fix error message if PS connection is lost while receiving prefetch (#9923) If the pageserver connection is lost while receiving the prefetch request, the prefetch queue is cleared. The error message prints the values from the prefetch slot, but because the slot was already cleared, they're all zeros: LOG: [NEON_SMGR] [shard 0] No response from reading prefetch entry 0: 0/0/0.0 block 0. 
This can be caused by a concurrent disconnect To fix, make local copies of the values. In the passing, also add a sanity check that if the receive() call succeeds, the prefetch slot is still intact. --- pgxn/neon/pagestore_smgr.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index a5e0c402fb..880c0de64e 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -610,6 +610,9 @@ prefetch_read(PrefetchRequest *slot) { NeonResponse *response; MemoryContext old; + BufferTag buftag; + shardno_t shard_no; + uint64 my_ring_index; Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); @@ -623,11 +626,29 @@ prefetch_read(PrefetchRequest *slot) slot->status, slot->response, (long)slot->my_ring_index, (long)MyPState->ring_receive); + /* + * Copy the request info so that if an error happens and the prefetch + * queue is flushed during the receive call, we can print the original + * values in the error message + */ + buftag = slot->buftag; + shard_no = slot->shard_no; + my_ring_index = slot->my_ring_index; + old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(slot->shard_no); + response = (NeonResponse *) page_server->receive(shard_no); MemoryContextSwitchTo(old); if (response) { + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; @@ -642,11 +663,15 @@ prefetch_read(PrefetchRequest *slot) } else { - neon_shard_log(slot->shard_no, LOG, + /* + * Note: The slot might no longer be valid, if the connection was lost + * and the prefetch queue was flushed during the receive call + */ + neon_shard_log(shard_no, LOG, "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long)slot->my_ring_index, - RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), - slot->buftag.forkNum, slot->buftag.blockNum); + (long) my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), + buftag.forkNum, buftag.blockNum); return false; } } From b1fd086c0c974447376d23cd6e3baf4f8248a1ce Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:30:04 -0500 Subject: [PATCH 21/26] test(pageserver): disable gc_compaction smoke test for now (#10045) ## Problem The test is flaky. ## Summary of changes Disable the test. 
--------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index e92dc47f39..881503046c 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -121,6 +121,9 @@ page_cache_size=10 assert vectored_average < 8 +@pytest.mark.skip( + "This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114" +) @skip_in_debug_build("only run with release build") def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): SMOKE_CONF = { From 4d7111f240062e161af1c298ffc5c28b5ed695fe Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 7 Dec 2024 09:57:55 +0100 Subject: [PATCH 22/26] page_service: don't count time spent flushing towards smgr latency metrics (#10042) ## Problem In #9962 I changed the smgr metrics to include time spent on flush. It isn't under our (=storage team's) control how long that flush takes because the client can stop reading requests. ## Summary of changes Stop the timer as soon as we've buffered up the response in the `pgb_writer`. Track flush time in a separate metric. --------- Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com> --- pageserver/src/metrics.rs | 138 +++++++++++++++++++++++++++++--- pageserver/src/page_service.rs | 76 ++++++++++++------ test_runner/fixtures/metrics.py | 1 + 3 files changed, 179 insertions(+), 36 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 62bf9acf01..96ee157856 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1223,31 +1223,60 @@ pub(crate) mod virtual_file_io_engine { }); } -pub(crate) struct SmgrOpTimer { +pub(crate) struct SmgrOpTimer(Option); +pub(crate) struct SmgrOpTimerInner { global_latency_histo: Histogram, // Optional because not all op types are tracked per-timeline per_timeline_latency_histo: Option, + global_flush_in_progress_micros: IntCounter, + per_timeline_flush_in_progress_micros: IntCounter, + start: Instant, throttled: Duration, op: SmgrQueryType, } +pub(crate) struct SmgrOpFlushInProgress { + base: Instant, + global_micros: IntCounter, + per_timeline_micros: IntCounter, +} + impl SmgrOpTimer { pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { let Some(throttle) = throttle else { return; }; - self.throttled += *throttle; + let inner = self.0.as_mut().expect("other public methods consume self"); + inner.throttled += *throttle; } -} -impl Drop for SmgrOpTimer { - fn drop(&mut self) { - let elapsed = self.start.elapsed(); + pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { + let (flush_start, inner) = self + .smgr_op_end() + .expect("this method consume self, and the only other caller is drop handler"); + let SmgrOpTimerInner { + global_flush_in_progress_micros, + per_timeline_flush_in_progress_micros, + .. + } = inner; + SmgrOpFlushInProgress { + base: flush_start, + global_micros: global_flush_in_progress_micros, + per_timeline_micros: per_timeline_flush_in_progress_micros, + } + } - let elapsed = match elapsed.checked_sub(self.throttled) { + /// Returns `None`` if this method has already been called, `Some` otherwise. 
+ fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> { + let inner = self.0.take()?; + + let now = Instant::now(); + let elapsed = now - inner.start; + + let elapsed = match elapsed.checked_sub(inner.throttled) { Some(elapsed) => elapsed, None => { use utils::rate_limit::RateLimit; @@ -1258,9 +1287,9 @@ impl Drop for SmgrOpTimer { }))) }); let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[self.op]; + let rate_limit = &mut guard[inner.op]; rate_limit.call(|| { - warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); + warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); }); elapsed // un-throttled time, more info than just saturating to 0 } @@ -1268,10 +1297,54 @@ impl Drop for SmgrOpTimer { let elapsed = elapsed.as_secs_f64(); - self.global_latency_histo.observe(elapsed); - if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { + inner.global_latency_histo.observe(elapsed); + if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo { per_timeline_getpage_histo.observe(elapsed); } + + Some((now, inner)) + } +} + +impl Drop for SmgrOpTimer { + fn drop(&mut self) { + self.smgr_op_end(); + } +} + +impl SmgrOpFlushInProgress { + pub(crate) async fn measure(mut self, mut fut: Fut) -> O + where + Fut: std::future::Future, + { + let mut fut = std::pin::pin!(fut); + + let now = Instant::now(); + // Whenever observe_guard gets called, or dropped, + // it adds the time elapsed since its last call to metrics. + // Last call is tracked in `now`. + let mut observe_guard = scopeguard::guard( + || { + let elapsed = now - self.base; + self.global_micros + .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); + self.base = now; + }, + |mut observe| { + observe(); + }, + ); + + loop { + match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { + Ok(v) => return v, + Err(_timeout) => { + (*observe_guard)(); + } + } + } } } @@ -1302,6 +1375,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_getpage_latency: Histogram, global_batch_size: Histogram, per_timeline_batch_size: Histogram, + global_flush_in_progress_micros: IntCounter, + per_timeline_flush_in_progress_micros: IntCounter, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1464,6 +1539,26 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { .set(value.try_into().unwrap()); } +static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_service_pagestream_flush_in_progress_micros", + "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \ + If the flush is particularly slow, this counter will be updated periodically to make slow flushes \ + easily discoverable in monitoring. 
\ + Hence, this is NOT a completion latency historgram.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_page_service_pagestream_flush_in_progress_micros_global", + "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.", + ) + .expect("failed to define a metric") +}); + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1504,6 +1599,12 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_flush_in_progress_micros = + PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); + let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + Self { global_started, global_latency, @@ -1511,6 +1612,8 @@ impl SmgrQueryTimePerTimeline { per_timeline_getpage_started, global_batch_size, per_timeline_batch_size, + global_flush_in_progress_micros, + per_timeline_flush_in_progress_micros, } } pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { @@ -1523,13 +1626,17 @@ impl SmgrQueryTimePerTimeline { None }; - SmgrOpTimer { + SmgrOpTimer(Some(SmgrOpTimerInner { global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, start: started_at, op, throttled: Duration::ZERO, - } + global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), + per_timeline_flush_in_progress_micros: self + .per_timeline_flush_in_progress_micros + .clone(), + })) } pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { @@ -2777,6 +2884,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7026df9527..97d94bbe7f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1017,10 +1017,8 @@ impl PageServerHandler { // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. - let mut timers: smallvec::SmallVec<[_; 1]> = - smallvec::SmallVec::with_capacity(handler_results.len()); for handler_result in handler_results { - let response_msg = match handler_result { + let (response_msg, timer) = match handler_result { Err(e) => match &e { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of @@ -1044,34 +1042,66 @@ impl PageServerHandler { span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) + ( + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }), + None, // TODO: measure errors + ) } }, - Ok((response_msg, timer)) => { - // Extending the lifetime of the timers so observations on drop - // include the flush time. 
- timers.push(timer); - response_msg - } + Ok((response_msg, timer)) => (response_msg, Some(timer)), }; + // // marshal & transmit response message + // + pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - } - tokio::select! { - biased; - _ = cancel.cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) - } - res = pgb_writer.flush() => { - res?; + + // We purposefully don't count flush time into the timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. + let flushing_timer = + timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing()); + + // what we want to do + let flush_fut = pgb_writer.flush(); + // metric for how long flushing takes + let flush_fut = match flushing_timer { + Some(flushing_timer) => { + futures::future::Either::Left(flushing_timer.measure(flush_fut)) + } + None => futures::future::Either::Right(flush_fut), + }; + // do it while respecting cancellation + let _: () = async move { + tokio::select! { + biased; + _ = cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = flush_fut => { + res?; + } + } + Ok(()) } + // and log the info! line inside the request span + .instrument(span.clone()) + .await?; } - drop(timers); Ok(()) } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 52ed7da36b..a591e088ef 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -176,6 +176,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), counter("pageserver_timeline_wal_records_received"), + counter("pageserver_page_service_pagestream_flush_in_progress_micros"), *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold From ec790870d54aadd1ecc6e431c9049b489ba33cd1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sat, 7 Dec 2024 13:05:09 +0000 Subject: [PATCH 23/26] storcon: automatically clear Pause/Stop scheduling policies to enable detaches (#10011) ## Problem We saw a tenant get stuck when it had been put into Pause scheduling mode to pin it to a pageserver, then it was left idle for a while and the control plane tried to detach it. Close: https://github.com/neondatabase/neon/issues/9957 ## Summary of changes - When changing policy to Detached or Secondary, set the scheduling policy to Active. 
- Add a test that exercises this
- When persisting tenant shards, set their `generation_pageserver` to null if the placement policy is not Attached (this enables consistency checks to work, and avoids leaving state in the DB that could be confusing/misleading in future)
---
 libs/pageserver_api/src/controller_api.rs    | 11 ++++
 storage_controller/src/persistence.rs        |  9 ++++
 storage_controller/src/service.rs            | 39 +++++++++++++-
 .../regress/test_storage_controller.py       | 52 +++++++++++++++++++
 4 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 9a5ebc95bd..6839ef69f5 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
     }
 }
 
+/// Scheduling policy enables us to selectively disable some automatic actions that the
+/// controller performs on a tenant shard. This is only set to a non-default value by
+/// human intervention, and it is reset to the default value (Active) when the tenant's
+/// placement policy is modified away from Attached.
+///
+/// The typical use of a non-Active scheduling policy is one of:
+/// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
+/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
+///
+/// If you're not sure which policy to use to pin a shard to its current location, you probably
+/// want Pause.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
     // Normal mode: the tenant's scheduled locations may be updated at will, including
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 14cc51240d..7ca80c7dfe 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -636,6 +636,13 @@ impl Persistence {
             .into_boxed(),
         };
 
+        // Clear generation_pageserver if we are moving into a state where we won't have
+        // any attached pageservers.
+        let input_generation_pageserver = match input_placement_policy {
+            None | Some(PlacementPolicy::Attached(_)) => None,
+            Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
+        };
+
         #[derive(AsChangeset)]
         #[diesel(table_name = crate::schema::tenant_shards)]
         struct ShardUpdate {
@@ -643,6 +650,7 @@ impl Persistence {
            placement_policy: Option<String>,
            config: Option<String>,
            scheduling_policy: Option<String>,
+           generation_pageserver: Option<Option<i64>>,
         }
 
         let update = ShardUpdate {
@@ -655,6 +663,7 @@ impl Persistence {
                .map(|c| serde_json::to_string(&c).unwrap()),
            scheduling_policy: input_scheduling_policy
                .map(|p| serde_json::to_string(&p).unwrap()),
+           generation_pageserver: input_generation_pageserver,
         };
 
         query.set(update).execute(conn)?;
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 083c78233a..7e4ee53b4c 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -513,6 +513,9 @@ struct ShardUpdate {
 
     /// If this is None, generation is not updated.
     generation: Option<Generation>,
+
+    /// If this is None, scheduling policy is not updated.
+ scheduling_policy: Option, } enum StopReconciliationsReason { @@ -2376,6 +2379,23 @@ impl Service { } }; + // Ordinarily we do not update scheduling policy, but when making major changes + // like detaching or demoting to secondary-only, we need to force the scheduling + // mode to Active, or the caller's expected outcome (detach it) will not happen. + let scheduling_policy = match req.config.mode { + LocationConfigMode::Detached | LocationConfigMode::Secondary => { + // Special case: when making major changes like detaching or demoting to secondary-only, + // we need to force the scheduling mode to Active, or nothing will happen. + Some(ShardSchedulingPolicy::Active) + } + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // While attached, continue to respect whatever the existing scheduling mode is. + None + } + }; + let mut create = true; for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Saw an existing shard: this is not a creation @@ -2401,6 +2421,7 @@ impl Service { placement_policy: placement_policy.clone(), tenant_config: req.config.tenant_conf.clone(), generation: set_generation, + scheduling_policy, }); } @@ -2497,6 +2518,7 @@ impl Service { placement_policy, tenant_config, generation, + scheduling_policy, } in &updates { self.persistence @@ -2505,7 +2527,7 @@ impl Service { Some(placement_policy.clone()), Some(tenant_config.clone()), *generation, - None, + *scheduling_policy, ) .await?; } @@ -2521,6 +2543,7 @@ impl Service { placement_policy, tenant_config, generation: update_generation, + scheduling_policy, } in updates { let Some(shard) = tenants.get_mut(&tenant_shard_id) else { @@ -2539,6 +2562,10 @@ impl Service { shard.generation = Some(generation); } + if let Some(scheduling_policy) = scheduling_policy { + shard.set_scheduling_policy(scheduling_policy); + } + shard.schedule(scheduler, &mut schedule_context)?; let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); @@ -2992,9 +3019,17 @@ impl Service { let TenantPolicyRequest { placement, - scheduling, + mut scheduling, } = req; + if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement { + // When someone configures a tenant to detach, we force the scheduling policy to enable + // this to take effect. 
+ if scheduling.is_none() { + scheduling = Some(ShardSchedulingPolicy::Active); + } + } + self.persistence .update_tenant_shard( TenantFilter::Tenant(tenant_id), diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index f878116d53..9f74dcccb9 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3230,3 +3230,55 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) raise + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_detached_stopped( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that detaching a tenant while it has scheduling policy set to Paused or Stop works + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + # Disable scheduling: ordinarily this would prevent the tenant's configuration being + # reconciled to pageservers, but this should be overridden when detaching. + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*") + env.storage_controller.tenant_policy_update( + tenant_id, + {"scheduling": "Stop"}, + ) + + env.storage_controller.consistency_check() + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.storage_controller.consistency_check() + + # Confirm the detach happened + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] From 9d425b54f72f7baa23f63c09c464d18c10304fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 9 Dec 2024 13:46:59 +0100 Subject: [PATCH 24/26] Update AWS SDK crates (#10056) Result of running: cargo update -p aws-types -p aws-sigv4 -p aws-credential-types -p aws-smithy-types -p aws-smithy-async -p aws-sdk-kms -p aws-sdk-iam -p aws-sdk-s3 -p aws-config We want to keep the AWS SDK up to date as that way we benefit from new developments and improvements. 
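For anyone reviewing or reproducing this bump locally, a minimal sketch of how to re-run it and confirm what ended up pinned (the crate list simply mirrors the command above; the `grep` lines are just one convenient way to read the pins out of `Cargo.lock` and are not part of the change itself):

```shell
# Lockfile-only update of the AWS SDK crate set.
cargo update -p aws-types -p aws-sigv4 -p aws-credential-types \
    -p aws-smithy-types -p aws-smithy-async -p aws-sdk-kms \
    -p aws-sdk-iam -p aws-sdk-s3 -p aws-config

# Spot-check the resulting pins without building anything.
grep -A 1 'name = "aws-sdk-s3"' Cargo.lock
grep -A 1 'name = "aws-config"' Cargo.lock
```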
--- Cargo.lock | 77 +++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f6e0024d87..a25c7585bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.5.1" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ac9889352d632214df943e26740c46a0f3da6e329fbd28164fe7ae1b061da7b" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -295,7 +295,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -304,7 +304,6 @@ dependencies = [ "fastrand 2.2.0", "hex", "http 0.2.9", - "hyper 0.14.30", "ring", "time", "tokio", @@ -327,9 +326,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.3" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -353,15 +352,15 @@ dependencies = [ [[package]] name = "aws-sdk-iam" -version = "1.46.0" +version = "1.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053df3024ea2ed0431359b3cddecc92dcfadeaedf71dd497292b39e37e597b46" +checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -376,15 +375,15 @@ dependencies = [ [[package]] name = "aws-sdk-kms" -version = "1.47.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e" +checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -398,11 +397,10 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.52.0" +version = "1.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f571deb0a80c20d21d9f3e8418c1712af9ff4bf399d057e5549a934eca4844e2" +checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e" dependencies = [ - "ahash", "aws-credential-types", "aws-runtime", "aws-sigv4", @@ -410,7 +408,7 @@ dependencies = [ "aws-smithy-checksums", "aws-smithy-eventstream", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -433,15 +431,15 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.30.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebb97e44983752cf7e12968c5f569a5d7562dbbc67006755c331d9d9c99580ae" +checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", 
- "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -455,15 +453,15 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.31.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad061d977235898e4a97ecbd5d882786cca41b4828943584dc792dcc35eb3d3c" +checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -477,15 +475,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.30.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "300ce43d1f7f4eb023e57d38b0921d964e8e62bed7f82f6b7849e7eab7a14575" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -500,9 +498,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.4" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -540,9 +538,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.12" +version = "0.60.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "598b1689d001c4d4dc3cb386adb07d37786783aee3ac4b324bcadac116bf3d23" +checksum = "ba1a71073fca26775c8b5189175ea8863afb1c9ea2cceb02a5de5ad9dfbaa795" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -600,6 +598,15 @@ dependencies = [ "aws-smithy-types", ] +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +dependencies = [ + "aws-smithy-types", +] + [[package]] name = "aws-smithy-query" version = "0.60.7" @@ -612,9 +619,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.2" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -639,9 +646,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.7.2" +version = "1.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" +checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -656,9 +663,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.7" +version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ "base64-simd", "bytes", From 
4cca5cdb12a6c3c957163931c89a376b5f47f17b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 9 Dec 2024 14:57:42 +0000 Subject: [PATCH 25/26] deps: update url to 2.5.4 for RUSTSEC-2024-0421 (#10059) ## Problem See https://rustsec.org/advisories/RUSTSEC-2024-0421 ## Summary of changes Update url crate to 2.5.4. --- Cargo.lock | 287 ++++++++++++++++++++++++++++++++++---- deny.toml | 1 + workspace_hack/Cargo.toml | 3 + 3 files changed, 267 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a25c7585bb..e9004748ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2131,9 +2131,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.1.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -2754,6 +2754,124 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" 
+dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -2762,12 +2880,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.3.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", ] [[package]] @@ -3076,6 +3205,12 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4" +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + [[package]] name = "lock_api" version = "0.4.10" @@ -4044,9 +4179,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.2.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "petgraph" @@ -4656,9 +4791,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.35" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -5706,9 +5841,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] @@ -5725,9 +5860,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -6469,6 +6604,16 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = 
"tinytemplate" version = "1.2.1" @@ -6481,9 +6626,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -6997,21 +7142,21 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -7062,9 +7207,9 @@ dependencies = [ [[package]] name = "url" -version = "2.3.1" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -7084,6 +7229,18 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.1" @@ -7676,8 +7833,10 @@ dependencies = [ "der 0.7.8", "deranged", "digest", + "displaydoc", "either", "fail", + "form_urlencoded", "futures-channel", "futures-executor", "futures-io", @@ -7726,6 +7885,7 @@ dependencies = [ "signature 2.2.0", "smallvec", "spki 0.7.3", + "stable_deref_trait", "subtle", "syn 2.0.90", "sync_wrapper 0.1.2", @@ -7750,6 +7910,18 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "x509-certificate" version = "0.23.1" @@ -7810,6 +7982,30 @@ dependencies = [ "time", ] +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + 
+[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.31" @@ -7831,6 +8027,27 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", + "synstructure", +] + [[package]] name = "zeroize" version = "1.7.0" @@ -7852,6 +8069,28 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "zstd" version = "0.13.0" diff --git a/deny.toml b/deny.toml index 7a1eecac99..ff8d71cda5 100644 --- a/deny.toml +++ b/deny.toml @@ -42,6 +42,7 @@ allow = [ "MPL-2.0", "OpenSSL", "Unicode-DFS-2016", + "Unicode-3.0", ] confidence-threshold = 0.8 exceptions = [ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index d19379aefd..33bdc25785 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -33,6 +33,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -78,6 +79,7 @@ sha2 = { version = "0.10", features = ["asm", "oid"] } signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } spki = { version = "0.7", default-features = false, features = ["pem", "std"] } +stable_deref_trait = { version = "1" } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } @@ -105,6 +107,7 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } +displaydoc = { version = "0.2" } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } From 
e74e7aac936543e5fcb9d122bedc2d146e41ddb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 9 Dec 2024 16:50:06 +0100 Subject: [PATCH 26/26] Use updated patched azure SDK crates (#10036) For a while already, we've been unable to update the Azure SDK crates due to Azure adopting use of a non-tokio async runtime, see #7545. The effort to upstream the fix got stalled, and I think it's better to switch to a patched version of the SDK that is up to date. Now we have a fork of the SDK under the neondatabase github org, to which I have applied Conrad's rebased patches to: https://github.com/neondatabase/azure-sdk-for-rust/tree/neon . The existence of a fork will also help with shipping bulk delete support before it's upstreamed (#7931). Also, in related news, the Azure SDK has gotten a rift in development, where the main branch pertains to a future, to-be-officially-blessed release of the SDK, and the older versions, which we are currently using, are on the `legacy` branch. Upstream doesn't really want patches for the `legacy` branch any more, they want to focus on the `main` efforts. However, even then, the `legacy` branch is still newer than what we are having right now, so let's switch to `legacy` for now. Depending on how long it takes, we can switch to the official version of the SDK once it's released or switch to the upstream `main` branch if there is changes we want before that. As a nice side effect of this PR, we now use reqwest 0.12 everywhere, dropping the dependency on version 0.11. Fixes #7545 --- Cargo.lock | 151 +++++++------------------- Cargo.toml | 10 +- libs/remote_storage/src/azure_blob.rs | 8 +- 3 files changed, 47 insertions(+), 122 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e9004748ae..e2d5e03613 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -770,77 +770,74 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" +version = "0.21.0" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" dependencies = [ "async-trait", - "base64 0.21.1", + "base64 0.22.1", "bytes", "dyn-clone", "futures", "getrandom 0.2.11", "hmac", "http-types", - "log", "once_cell", "paste", "pin-project", "quick-xml 0.31.0", "rand 0.8.5", - "reqwest 0.11.19", + "reqwest", "rustc_version", "serde", "serde_json", "sha2", "time", + "tracing", "url", "uuid", ] [[package]] name = "azure_identity" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" +version = "0.21.0" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" dependencies = [ "async-lock", "async-trait", "azure_core", "futures", - "log", "oauth2", "pin-project", "serde", "time", + "tokio", + "tracing", "url", "uuid", ] [[package]] name = "azure_storage" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" +version = "0.21.0" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" dependencies = [ "RustyXML", "async-lock", "async-trait", "azure_core", "bytes", - "log", "serde", "serde_derive", "time", + "tracing", "url", "uuid", ] 
[[package]] name = "azure_storage_blobs" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" +version = "0.21.0" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" dependencies = [ "RustyXML", "azure_core", @@ -848,20 +845,19 @@ dependencies = [ "azure_svc_blobstorage", "bytes", "futures", - "log", "serde", "serde_derive", "serde_json", "time", + "tracing", "url", "uuid", ] [[package]] name = "azure_svc_blobstorage" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" +version = "0.21.0" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" dependencies = [ "azure_core", "bytes", @@ -1287,7 +1283,7 @@ dependencies = [ "prometheus", "regex", "remote_storage", - "reqwest 0.12.4", + "reqwest", "rlimit", "rust-ini", "serde", @@ -1395,7 +1391,7 @@ dependencies = [ "postgres_backend", "postgres_connection", "regex", - "reqwest 0.12.4", + "reqwest", "safekeeper_api", "scopeguard", "serde", @@ -1904,15 +1900,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "encoding_rs" -version = "0.8.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" -dependencies = [ - "cfg-if", -] - [[package]] name = "enum-map" version = "2.5.0" @@ -3709,7 +3696,7 @@ dependencies = [ "bytes", "http 1.1.0", "opentelemetry", - "reqwest 0.12.4", + "reqwest", ] [[package]] @@ -3726,7 +3713,7 @@ dependencies = [ "opentelemetry-proto", "opentelemetry_sdk", "prost", - "reqwest 0.12.4", + "reqwest", "thiserror", ] @@ -3935,7 +3922,7 @@ dependencies = [ "range-set-blaze", "regex", "remote_storage", - "reqwest 0.12.4", + "reqwest", "rpds", "scopeguard", "send-future", @@ -3988,7 +3975,7 @@ dependencies = [ "postgres_ffi", "rand 0.8.5", "remote_storage", - "reqwest 0.12.4", + "reqwest", "serde", "serde_json", "serde_with", @@ -4008,7 +3995,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest 0.12.4", + "reqwest", "serde", "thiserror", "tokio", @@ -4725,7 +4712,7 @@ dependencies = [ "redis", "regex", "remote_storage", - "reqwest 0.12.4", + "reqwest", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", @@ -5088,47 +5075,6 @@ dependencies = [ "utils", ] -[[package]] -name = "reqwest" -version = "0.11.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20b9b67e2ca7dd9e9f9285b759de30ff538aab981abaaf7bc9bd90b84a0126c3" -dependencies = [ - "base64 0.21.1", - "bytes", - "encoding_rs", - "futures-core", - "futures-util", - "h2 0.3.26", - "http 0.2.9", - "http-body 0.4.5", - "hyper 0.14.30", - "hyper-rustls 0.24.0", - "ipnet", - "js-sys", - "log", - "mime", - "once_cell", - "percent-encoding", - "pin-project-lite", - "rustls 0.21.12", - "rustls-pemfile 1.0.2", - "serde", - "serde_json", - "serde_urlencoded", - "tokio", - "tokio-rustls 0.24.0", - "tokio-util", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-streams 0.3.0", - "web-sys", - "webpki-roots 0.25.2", - "winreg 0.50.0", -] - [[package]] name = "reqwest" version = "0.12.4" @@ -5168,10 +5114,10 @@ dependencies = [ "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams 0.4.0", + "wasm-streams", "web-sys", "webpki-roots 0.26.1", - 
"winreg 0.52.0", + "winreg", ] [[package]] @@ -5183,7 +5129,7 @@ dependencies = [ "anyhow", "async-trait", "http 1.1.0", - "reqwest 0.12.4", + "reqwest", "serde", "thiserror", "tower-service", @@ -5202,7 +5148,7 @@ dependencies = [ "http 1.1.0", "hyper 1.4.1", "parking_lot 0.11.2", - "reqwest 0.12.4", + "reqwest", "reqwest-middleware", "retry-policies", "thiserror", @@ -5223,7 +5169,7 @@ dependencies = [ "http 1.1.0", "matchit 0.8.2", "opentelemetry", - "reqwest 0.12.4", + "reqwest", "reqwest-middleware", "tracing", "tracing-opentelemetry", @@ -5587,7 +5533,7 @@ dependencies = [ "rand 0.8.5", "regex", "remote_storage", - "reqwest 0.12.4", + "reqwest", "safekeeper_api", "scopeguard", "sd-notify", @@ -5743,7 +5689,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest 0.12.4", + "reqwest", "rustls 0.21.12", "sentry-backtrace", "sentry-contexts", @@ -6198,7 +6144,7 @@ dependencies = [ "postgres_connection", "r2d2", "rand 0.8.5", - "reqwest 0.12.4", + "reqwest", "routerify", "scopeguard", "serde", @@ -6218,7 +6164,7 @@ name = "storage_controller_client" version = "0.1.0" dependencies = [ "pageserver_client", - "reqwest 0.12.4", + "reqwest", "serde", "workspace_hack", ] @@ -6245,7 +6191,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "remote_storage", - "reqwest 0.12.4", + "reqwest", "rustls 0.23.18", "rustls-native-certs 0.8.0", "serde", @@ -6274,7 +6220,7 @@ dependencies = [ "humantime", "pageserver_api", "pageserver_client", - "reqwest 0.12.4", + "reqwest", "serde_json", "storage_controller_client", "tokio", @@ -7514,19 +7460,6 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" -[[package]] -name = "wasm-streams" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" -dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - [[package]] name = "wasm-streams" version = "0.4.0" @@ -7792,16 +7725,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.50.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "winreg" version = "0.52.0" @@ -7876,7 +7799,7 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest 0.12.4", + "reqwest", "rustls 0.23.18", "scopeguard", "serde", diff --git a/Cargo.toml b/Cargo.toml index a35823e0c2..0654c25a3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,10 +51,6 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } -azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } -azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -216,6 +212,12 @@ 
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } +## Azure SDK crates +azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] } + ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 8d1962fa29..a1d7569140 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -8,15 +8,14 @@ use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; -use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use anyhow::Context; use anyhow::Result; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, RetryOptions}; -use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; @@ -76,8 +75,9 @@ impl AzureBlobStorage { let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") { StorageCredentials::access_key(account.clone(), access_key) } else { - let token_credential = DefaultAzureCredential::default(); - StorageCredentials::token_credential(Arc::new(token_credential)) + let token_credential = azure_identity::create_default_credential() + .context("trying to obtain Azure default credentials")?; + StorageCredentials::token_credential(token_credential) }; // we have an outer retry